def main():
    #langs = ['russian', 'turkish', 'spanish', 'arabic', 'georgian', 'german', 'navajo', 'finnish']
    langs = ['arabic']
    sig_root = '/Users/roeeaharoni/GitHub/sigmorphon2016/'
    for lang in langs:
        train_path = '{0}/data/{1}-task1-train'.format(sig_root, lang)
        test_path = '{0}/data/{1}-task1-dev'.format(sig_root, lang)
        # load train and test data
        (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
        (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
        alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

        # align the words to the inflections, the alignment will later be used by the model
        print 'started aligning'
        train_word_pairs = zip(train_lemmas, train_words)
        test_word_pairs = zip(test_lemmas, test_words)
        align_symbol = '~'

        # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
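        # mcmc_align presumably samples a character-level alignment for each (lemma, word)
        # pair, padding unmatched positions with the align symbol ('~')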
        train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

        # TODO: align together?
        test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
        # random.shuffle(train_aligned_pairs)
        # for p in train_aligned_pairs[:100]:
        #    generate_template(p)
        print 'finished aligning'
        for i, p in enumerate(test_aligned_pairs):
            print i
            print p[0]
            print p[1] + '\n'
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    parallelize_training = True
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE,
                    'REGULARIZATION': REGULARIZATION, 'LEARNING_RATE': LEARNING_RATE}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}
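    # inverse mapping, used to convert predicted indices back into characters during decoding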

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    # factored model: new model per inflection type
    params = []
    for morph_index, morph_type in enumerate(train_morph_to_data_indices):
        params.append([input_dim, hidden_dim, layers, morph_index, morph_type, train_lemmas, train_words, test_lemmas,
                       train_morph_to_data_indices, test_words, test_morph_to_data_indices, alphabet, alphabet_index,
                       inverse_alphabet_index, epochs, optimization, results_file_path])
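    # each params entry is the full argument list for training one per-inflection-type model;
    # train_morph_model_wrapper presumably unpacks a single entry, since Pool.map passes one argument per call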

    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        p.map(train_morph_model_wrapper, params)
        print 'finished training all models'
    else:
        for p in params:
            train_morph_model(*p)

    # evaluate best models
    os.system('python task1_evaluate_best_factored_models.py --cnn-mem 4096 --input={0} --hidden={1} --epochs={2} --layers={3}\
  --optimization={4} {5} {6} {7} {8}'.format(input_dim, hidden_dim, epochs, layers, optimization, train_path, test_path,
                                             results_file_path, sigmorphon_root_dir))
    return
def init_model(dev_path, feat_input_dim, hidden_dim, input_dim, layers,
               results_file_path, test_path, train_path):

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas,
     dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    model_file_name = results_file_path + '_bestmodel.txt'

    # load model and everything else needed for prediction
    initial_model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = hard_attention.load_best_model(
        alphabet, results_file_path, input_dim, hidden_dim, layers,
        feature_alphabet, feat_input_dim, feature_types)

    print 'loaded existing model successfully'
    return (initial_model, char_lookup, feat_lookup, R, bias, encoder_frnn,
            encoder_rrnn, decoder_rnn, alphabet_index, feat_index,
            feature_types, inverse_alphabet_index, dev_words, dev_lemmas,
            dev_feat_dicts)
def convert_sigmorphon_to_morphtrans(sig_file, morphtrans_file, create_alphabet=True):

    (words, lemmas, feat_dicts) = prepare_sigmorphon_data.load_data(sig_file)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(words, lemmas, feat_dicts)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    if create_alphabet:
        with codecs.open(morphtrans_file + '.word_alphabet', "w", encoding='utf8') as alphabet_file:
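            # keep only single-character symbols (len(c) < 2 drops multi-character markers),
            # then append the explicit word-boundary symbols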
            alphabet_file.write(' '.join([c for c in list(alphabet) if len(c) < 2]) + ' ' + END_WORD + ' '
                                + BEGIN_WORD)

        morph2feats = common.cluster_data_by_morph_type(feat_dicts, feats)
        with codecs.open(morphtrans_file + '.morph_alphabet', "w", encoding='utf8') as alphabet_file:
            alphabet_file.write(' '.join([key for key in morph2feats.keys()]))

    with codecs.open(morphtrans_file, "w", encoding='utf8') as output_file:
        for lemma, word, feat_dict in zip(lemmas, words, feat_dicts):
            # <s> a b g a s k l a p p e </s>|<s> a b g a s k l a p p e </s>|case=nominative:number=singular
            output_file.write(BEGIN_WORD + ' ' + ' '.join(list(lemma)) + ' ' + END_WORD + '|' + BEGIN_WORD + ' ' +
                              ' '.join(list(word)) + ' ' + END_WORD + '|' + get_morph_string(feat_dict, feats) + '\n')
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim):
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(
        train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # cluster the data by inflection type (features)
    # train_cluster_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_cluster_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

    accuracies = []
    final_results = {}
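    # accuracies collects one (example count, accuracy) pair per cluster; final_results maps the
    # original example index to (lemma, features, predicted inflection) so predictions can be
    # written out in the original file order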

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i] for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print 'only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping'
            continue
        else:
            print 'now evaluating model for cluster ' + str(cluster_index + 1) + '/' + \
                  str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' + \
                  str(len(train_cluster_words)) + ' examples'

        # test best model
        try:
            test_cluster_lemmas = [
                test_lemmas[i]
                for i in test_cluster_to_data_indices[cluster_type]
            ]
            test_cluster_words = [
                test_words[i]
                for i in test_cluster_to_data_indices[cluster_type]
            ]
            test_cluster_feat_dicts = [
                test_feat_dicts[i]
                for i in test_cluster_to_data_indices[cluster_type]
            ]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim,
                hidden_dim, layers, feature_alphabet, feat_input_dim,
                feature_types)

            predicted_templates = task1_joint_structured_inflection_feedback_fix.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index, test_cluster_lemmas,
                test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_joint_structured_inflection_feedback_fix.evaluate_model(
                predicted_templates,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                test_cluster_words,
                feature_types,
                print_results=False)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection_feedback_fix.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

        except KeyError:
            print 'could not find relevant examples in test data for cluster: ' + cluster_type

    accuracy_vals = [accuracies[i][1] for i in xrange(len(accuracies))]
    macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)
    print 'macro avg accuracy: ' + str(macro_avg_accuracy)
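    # the micro average below weights each cluster's accuracy by its test example count
    # (accuracies[i][0]), unlike the uniform macro average above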

    mic_nom = sum(
        [accuracies[i][0] * accuracies[i][1] for i in xrange(len(accuracies))])
    mic_denom = sum([accuracies[i][0] for i in xrange(len(accuracies))])
    micro_average_accuracy = mic_nom / mic_denom
    print 'micro avg accuracy: ' + str(micro_average_accuracy)

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'
    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results)
def main():

    langs = [
        'russian', 'georgian', 'finnish', 'arabic', 'navajo', 'spanish',
        'turkish', 'german', 'hungarian', 'maltese'
    ]
    for lang in langs:
        task_num = 1
        train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/{0}-task{1}-train'.format(
            lang, str(task_num))
        dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/{0}-task{1}-dev'.format(
            lang, str(task_num))

        if task_num == 1 or task_num == 3:
            (train_targets, train_sources,
             train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
            (test_words, test_lemmas,
             test_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
            alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
                train_targets, train_sources, train_feat_dicts)
            train_cluster_to_data_indices = common.cluster_data_by_pos(
                train_feat_dicts)
            test_cluster_to_data_indices = common.cluster_data_by_pos(
                test_feat_dicts)
            train_morph_to_data_indices = common.cluster_data_by_morph_type(
                train_feat_dicts, feature_types)
            test_morph_to_data_indices = common.cluster_data_by_morph_type(
                test_feat_dicts, feature_types)
        if task_num == 2:
            (train_targets, train_sources, train_target_feat_dicts,
             train_source_feat_dicts) = prepare_sigmorphon_data.load_data(
                 train_path, task=2)
            (test_targets, test_sources, test_target_feat_dicts,
             test_source_feat_dicts) = prepare_sigmorphon_data.load_data(
                 dev_path, task=2)
            alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
                train_targets, train_sources, train_target_feat_dicts,
                train_source_feat_dicts)
            train_cluster_to_data_indices = common.cluster_data_by_pos(
                train_target_feat_dicts)
            test_cluster_to_data_indices = common.cluster_data_by_pos(
                test_target_feat_dicts)
            train_morph_to_data_indices = common.cluster_data_by_morph_type(
                train_target_feat_dicts, feature_types)
            test_morph_to_data_indices = common.cluster_data_by_morph_type(
                test_target_feat_dicts, feature_types)

        train_agg = 0
        for cluster in train_cluster_to_data_indices:
            train_agg += len(train_cluster_to_data_indices[cluster])
            print 'train ' + lang + ' ' + cluster + ' : ' + str(
                len(train_cluster_to_data_indices[cluster])) + ' examples'

        print 'train ' + lang + ' ' + 'agg' + ' : ' + str(
            train_agg) + ' examples'
        dev_agg = 0
        for cluster in test_cluster_to_data_indices:
            dev_agg += len(test_cluster_to_data_indices[cluster])
            print 'dev ' + lang + ' ' + cluster + ' : ' + str(
                len(test_cluster_to_data_indices[cluster])) + ' examples'
        print 'dev ' + lang + ' ' + 'agg' + ' : ' + str(dev_agg) + ' examples'
        print lang + ' train morphs: ' + str(len(train_morph_to_data_indices))
        print lang + ' avg ex. per morph: ' + str(
            sum([len(l) for l in train_morph_to_data_indices.values()]) /
            float(len(train_morph_to_data_indices)))
        print lang + ' dev morphs: ' + str(len(test_morph_to_data_indices))
        print lang + ' num features: ' + str(len(feature_types))

        for cluster in train_cluster_to_data_indices:
            train_cluster_words = [
                train_targets[i]
                for i in train_cluster_to_data_indices[cluster]
            ]
            train_cluster_lemmas = [
                train_sources[i]
                for i in train_cluster_to_data_indices[cluster]
            ]
            prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg, del_avg = get_morpheme_stats(
                train_cluster_words, train_cluster_lemmas)
            print "train {0} {1}    {2} &  {3} & {4} & {5} & {6} & {7:.3f} & {8:.3f}".format(
                lang, cluster, prefix_count, suffix_count, same_count,
                circumfix_count, other_count, lev_avg, del_avg)

        for cluster in train_cluster_to_data_indices:
            print 'train ' + lang + ' ' + cluster + ' : ' + str(
                len(train_cluster_to_data_indices[cluster])) + ' examples'

        prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg, del_avg = get_morpheme_stats(
            train_targets, train_sources)
        print "train {0} {1}    {2} &  {3} & {4} & {5} & {6} & {7:.3f} & {8:.3f}".format(
            lang, 'AGG', prefix_count, suffix_count, same_count,
            circumfix_count, other_count, lev_avg, del_avg)
def main(train_path, dev_path, test_path, results_file_path,
         sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot, override,
         eval_only, ensemble):
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': regularization,
        'LEARNING_RATE': learning_rate
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    (dev_words, dev_lemmas,
     dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    model_file_name = results_file_path + '_bestmodel.txt'
    if os.path.isfile(model_file_name) and not override:
        print 'loading existing model from {}'.format(model_file_name)
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers,
            feature_alphabet, feat_input_dim, feature_types)
        print 'loaded existing model successfully'
    else:
        print 'could not find existing model or explicit override was requested. starting training from scratch...'
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(
            alphabet, input_dim, hidden_dim, layers, feature_types,
            feat_input_dim, feature_alphabet)
    if not eval_only:
        # start training
        trained_model, last_epoch, best_epoch = train_model(
            model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
            train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts,
            dev_words, alphabet_index, inverse_alphabet_index, epochs,
            optimization, results_file_path, feat_index, feature_types, plot)
        model = trained_model
        print 'last epoch is {}'.format(last_epoch)
        print 'best epoch is {}'.format(best_epoch)
        print 'finished training'
    else:
        print 'skipped training, evaluating on test set...'

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(
            alphabet, alphabet_index, ensemble, feat_index, feat_input_dim,
            feature_alphabet, feature_types, hidden_dim, input_dim,
            inverse_alphabet_index, layers, test_feat_dicts, test_lemmas,
            test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn,
                                                encoder_frnn, encoder_rrnn,
                                                alphabet_index,
                                                inverse_alphabet_index,
                                                test_lemmas, test_feat_dicts,
                                                feat_index, feature_types)
    if len(predicted_sequences) > 0:
        # evaluate last model on test
        amount, accuracy = evaluate_model(predicted_sequences,
                                          test_lemmas,
                                          test_feat_dicts,
                                          test_words,
                                          feature_types,
                                          print_results=False)
        print 'initial eval: {}% accuracy'.format(accuracy)

        final_results = {}
        for i in xrange(len(test_lemmas)):
            joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                test_feat_dicts[i], feature_types)
            inflection = predicted_sequences[joint_index]
            final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                ''.join(inflection))

        # evaluate best models
        common.write_results_file_and_evaluate_externally(
            hyper_params, accuracy, train_path, test_path,
            results_file_path + '.external_eval.txt', sigmorphon_root_dir,
            final_results)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_target_words, train_source_words, train_target_feat_dicts,
     train_source_feat_dicts) = prepare_sigmorphon_data.load_data(train_path, 2)
    (test_target_words, test_source_words, test_target_feat_dicts,
     test_source_feat_dicts) = prepare_sigmorphon_data.load_data(test_path, 2)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_target_words, train_source_words,
                                                                   train_target_feat_dicts, train_source_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
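    # task 2 maps one inflected form to another, so both source and target feature bundles
    # contribute to the shared feature vocabulary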
    feature_alphabet = common.get_feature_alphabet(train_source_feat_dicts + train_target_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_source_words, train_target_words)
    test_word_pairs = zip(test_source_words, test_target_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    # TODO: do we need to cluster on both source and target feats? 
    #       probably enough to cluster on source here because pos will be same
    #       (no derivational morphology in this task)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_source_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_source_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append(
            [input_dim, hidden_dim, layers, cluster_index, cluster_type, train_source_words, train_source_feat_dicts,
             train_target_words, train_target_feat_dicts, test_source_words, test_source_feat_dicts,
             train_cluster_to_data_indices, test_target_words, test_target_feat_dicts,
             test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
             optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
             feature_types, feat_input_dim, feature_alphabet, plot])

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            trained_model, last_epoch = train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system('python task2_evaluate_best_joint_structured_models_blstm_feed_fix.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    return
def main(train_path, dev_path, test_path, results_path):
    # read morph input files (train+dev)
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)

    merged_train_dev_lemmas = []
    merged_train_dev_words = []
    merged_train_dev_feat_dicts = []

    if dev_path != 'NONE':
        # merge the train and dev files, if dev exists
        merged_train_dev_lemmas += train_lemmas
        merged_train_dev_lemmas += dev_lemmas

        merged_train_dev_words += train_words
        merged_train_dev_words += dev_words

        merged_train_dev_feat_dicts += train_feat_dicts
        merged_train_dev_feat_dicts += dev_feat_dicts

    # TODO: optional - implement data augmentation

    # concatenate feats and characters for input
    tokenized_test_inputs, tokenized_test_outputs = convert_sigmorphon_to_MED_format(test_feat_dicts, test_lemmas, test_words)

    tokenized_train_inputs, tokenized_train_outputs = convert_sigmorphon_to_MED_format(train_feat_dicts, train_lemmas, train_words)

    tokenized_dev_inputs, tokenized_dev_outputs = convert_sigmorphon_to_MED_format(dev_feat_dicts, dev_lemmas, dev_words)

    tokenized_merged_inputs, tokenized_merged_outputs = convert_sigmorphon_to_MED_format(merged_train_dev_feat_dicts,
                                                                                         merged_train_dev_lemmas,
                                                                                         merged_train_dev_words)

    parallel_data = zip(tokenized_train_inputs, tokenized_train_outputs)
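    # note: parallel_data is not used further in this function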

    # write input and output files
    train_inputs_file_path, train_outputs_file_path = write_converted_file(
        results_path, tokenized_train_inputs, tokenized_train_outputs, 'train.in', 'train.out')

    train_inputs_file_path, train_outputs_file_path = write_converted_file(
        results_path, tokenized_train_inputs, tokenized_train_outputs, 'train.in.tok', 'train.out.tok')

    test_inputs_file_path, test_outputs_file_path = write_converted_file(
        results_path, tokenized_test_inputs, tokenized_test_outputs, 'test.in', 'test.out')

    test_inputs_file_path, test_outputs_file_path = write_converted_file(
        results_path, tokenized_test_inputs, tokenized_test_outputs, 'test.in.tok', 'test.out.tok')

    merged_inputs_file_path, merged_outputs_file_path = write_converted_file(
        results_path, tokenized_merged_inputs, tokenized_merged_outputs, 'merged.in', 'merged.out')

    merged_inputs_file_path, merged_outputs_file_path = write_converted_file(
        results_path, tokenized_merged_inputs, tokenized_merged_outputs, 'merged.in.tok', 'merged.out.tok')

    dev_inputs_file_path, dev_outputs_file_path = write_converted_file(
        results_path, tokenized_dev_inputs, tokenized_dev_outputs, 'dev.in', 'dev.out')

    dev_inputs_file_path, dev_outputs_file_path = write_converted_file(
        results_path, tokenized_dev_inputs, tokenized_dev_outputs, 'dev.in.tok', 'dev.out.tok')


    # after the above files are created, preprocess them (hackily) by setting the args variables in prepare_data.py
    # to point to the created files. the only changes required in the original prepare_data.py code are:

    # args.source = 'train.in'
    # args.target = 'train.out'
    # args.source_dev = 'test.in'
    # args.target_dev = 'test.out'

    # tr_files = ['/Users/roeeaharoni/GitHub/morphological-reinflection/src/machine_translation/data/train.in',
    #             '/Users/roeeaharoni/GitHub/morphological-reinflection/src/machine_translation/data/train.out']

    # change shuf to gshuf on mac

    # blocks search.py - line 102 - add on_unused_input='ignore'

    # eventually, run training script on the preprocessed files by changing those values in configuration.py:
    # bleu_val_freq, val_burn_in, val_set, val_set_grndtruth

    # and then run:
    # python -m machine_translation

    # finally run the script that converts the validation_out.txt file into the sigmorphon format and run evaluation
    sigmorphon_dev_file_path = dev_path
    MED_validation_file_path = './search_model_morph/validation_out.txt'
    output_file_path = './search_model_morph/validation_out.sigmorphon.txt'
    convert_MED_output_to_sigmorphon_format(sigmorphon_dev_file_path, MED_validation_file_path, output_file_path)

    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim, nbest):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization, 'NBEST': nbest}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(
        train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(
        test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # cluster the data by inflection type (features)
    # train_cluster_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_cluster_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print 'only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping'
            continue
        else:
            print 'now evaluating model for cluster ' + str(cluster_index + 1) + '/' + \
                  str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' + \
                  str(len(train_cluster_words)) + ' examples'

        # test best model
        test_cluster_lemmas = [test_lemmas[i] for i in test_cluster_to_data_indices[cluster_type]]
        test_cluster_words = [test_words[i] for i in test_cluster_to_data_indices[cluster_type]]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_cluster_to_data_indices[cluster_type]]

        # load best model
        best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(cluster_index), alphabet,
                                                                              results_file_path, input_dim,
                                                                              hidden_dim, layers,
                                                                              feature_alphabet, feat_input_dim,
                                                                              feature_types)

        lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'), '')
        if nbest == 1:
            is_nbest = False
            predicted_templates = task1_joint_structured_inflection_blstm_feedback_fix.predict_templates(
                best_model,
                decoder_rnn,
                encoder_frnn, encoder_rrnn,
                alphabet_index,
                inverse_alphabet_index,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                feat_index,
                feature_types)

            accuracy = task1_joint_structured_inflection_blstm_feedback_fix.evaluate_model(
                predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts, test_cluster_words,
                feature_types, print_results=False)
            accuracies.append(accuracy)
            print '{0} {1} accuracy: {2}'.format(lang, cluster_type, accuracy[1])

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection_blstm_feedback_fix.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

            micro_average_accuracy = accuracy[1]

        else:
            is_nbest = True

            predicted_nbest_templates = task1_joint_structured_inflection_blstm_feedback_fix.predict_nbest_templates(
                best_model,
                decoder_rnn,
                encoder_frnn,
                encoder_rrnn,
                alphabet_index,
                inverse_alphabet_index,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                feat_index,
                feature_types,
                nbest,
                test_cluster_words)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)

                nbest_inflections = []
                templates = [t for (t, p) in predicted_nbest_templates[joint_index]]
                for template in templates:
                    nbest_inflections.append(
                            task1_joint_structured_inflection_blstm_feedback_fix.instantiate_template(
                                template,
                                test_lemmas[i]))
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], nbest_inflections)

            micro_average_accuracy = -1


    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file(hyper_params,
                              micro_average_accuracy,
                              train_path,
                              test_path,
                              results_file_path + suffix,
                              sigmorphon_root_dir,
                              final_results,
                              is_nbest)
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    if not eval_only:
        last_epochs = []
        trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas, train_feat_dicts,
                                                        train_words, dev_lemmas, dev_feat_dicts, dev_words,
                                                        alphabet, alphabet_index, inverse_alphabet_index, epochs,
                                                        optimization, results_file_path, train_aligned_pairs,
                                                        dev_aligned_pairs,
                                                        feat_index, feature_types, feat_input_dim, feature_alphabet,
                                                        plot)

        # print when training stopped
        print 'stopped on epoch {}'.format(last_epoch)
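        # note: last_epochs is never appended to here, so the .epochs file below is written empty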

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    print '=========DEV EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, dev_feat_dicts, dev_lemmas, dev_path,
                  dev_words, train_path)

    # eval on test
    print '=========TEST EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)

    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim,
         nbest):
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'NBEST': nbest
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_target_words, train_source_words, train_target_feat_dicts,
     train_source_feat_dicts) = prepare_sigmorphon_data.load_data(
         train_path, 2)
    (test_target_words, test_source_words, test_target_feat_dicts,
     test_source_feat_dicts) = prepare_sigmorphon_data.load_data(test_path, 2)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_target_words, train_source_words, train_target_feat_dicts,
        train_source_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_source_feat_dicts +
                                                   train_target_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }
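    # for illustration: alphabet_index maps every character (including the special
    # symbols appended above) to a unique integer id, and inverse_alphabet_index
    # maps those ids back to characters when decoding model output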

    # cluster the data by POS type (features)
    # TODO: do we need to cluster on both source and target feats?
    #       probably enough to cluster on the source here because the POS will be the same
    #       (no derivational morphology in this task)
    # train_cluster_to_data_indices = common.cluster_data_by_pos(train_source_feat_dicts)
    # test_cluster_to_data_indices = common.cluster_data_by_pos(test_source_feat_dicts)

    # cluster the data by inflection type (features)
    # train_cluster_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_cluster_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

    # no clustering, single model
    train_cluster_to_data_indices = common.get_single_pseudo_cluster(
        train_source_feat_dicts)
    test_cluster_to_data_indices = common.get_single_pseudo_cluster(
        test_source_feat_dicts)
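    # get_single_pseudo_cluster presumably returns one cluster covering all data
    # indices (an assumption about its exact shape), so a single model is trained
    # and evaluated instead of one per POS or inflection type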

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_target_words = [
            train_target_words[i]
            for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_target_words) < 1:
            print 'only ' + str(
                len(train_cluster_target_words
                    )) + ' samples for this inflection type. skipping'
            continue
        else:
            print 'now evaluating model for cluster ' + str(cluster_index + 1) + '/' + \
                  str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' + \
                  str(len(train_cluster_target_words)) + ' examples'

        # test best model

        test_cluster_source_words = [
            test_source_words[i]
            for i in test_cluster_to_data_indices[cluster_type]
        ]
        test_cluster_target_words = [
            test_target_words[i]
            for i in test_cluster_to_data_indices[cluster_type]
        ]
        test_cluster_source_feat_dicts = [
            test_source_feat_dicts[i]
            for i in test_cluster_to_data_indices[cluster_type]
        ]
        test_cluster_target_feat_dicts = [
            test_target_feat_dicts[i]
            for i in test_cluster_to_data_indices[cluster_type]
        ]

        # load best model
        best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
            str(cluster_index), alphabet, results_file_path, input_dim,
            hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)

        lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'),
                                                 '')

        # handle greedy prediction
        if nbest == 1:
            is_nbest = False
            predicted_templates = task2_ms2s.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index,
                test_cluster_source_words, test_cluster_source_feat_dicts,
                test_cluster_target_feat_dicts, feat_index, feature_types)

            accuracy = task2_ms2s.evaluate_model(
                predicted_templates,
                test_cluster_source_words,
                test_cluster_source_feat_dicts,
                test_cluster_target_words,
                test_cluster_target_feat_dicts,
                feature_types,
                print_results=False)
            accuracies.append(accuracy)
            print '{0} {1} accuracy: {2}'.format(lang, cluster_type,
                                                 accuracy[1])

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_source_words[i] + ':' + common.get_morph_string(test_source_feat_dicts[i],
                                                                                   feature_types) \
                              + ':' + common.get_morph_string(test_target_feat_dicts[i], feature_types)
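                # joint_index concatenates the source word with the source and target
                # feature strings; it must match the keys used by predict_templates above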
                inflection = task2_ms2s.instantiate_template(
                    predicted_templates[joint_index], test_source_words[i])
                final_results[i] = (test_source_words[i],
                                    test_source_feat_dicts[i], inflection,
                                    test_target_feat_dicts[i])

            micro_average_accuracy = accuracy[1]

        # handle n-best prediction
        else:
            is_nbest = True

            predicted_nbest_templates = task2_ms2s.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index,
                test_cluster_source_words, test_cluster_source_feat_dicts,
                test_cluster_target_feat_dicts, feat_index, feature_types,
                nbest, test_cluster_target_words)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_source_words[i] + ':' + common.get_morph_string(test_source_feat_dicts[i],
                                                                                   feature_types) \
                              + ':' + common.get_morph_string(test_target_feat_dicts[i], feature_types)

                nbest_inflections = []
                templates = [
                    t for (t, p) in predicted_nbest_templates[joint_index]
                ]
                for template in templates:
                    nbest_inflections.append(
                        task2_ms2s.instantiate_template(
                            template, test_source_words[i]))
                final_results[i] = (test_source_words[i],
                                    test_source_feat_dicts[i],
                                    nbest_inflections,
                                    test_target_feat_dicts[i])

            micro_average_accuracy = -1

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    task2_joint_structured_inflection.write_results_file(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results,
        is_nbest)
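    # note: in the n-best branch above no accuracy is computed, so the results file
    # records micro_average_accuracy = -1 whenever nbest > 1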
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': REGULARIZATION,
                    'LEARNING_RATE': LEARNING_RATE}
    parallelize_training = True
    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # cluster the data by inflection type (features) - used for sanity check
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # generate params for each model
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, feat_index, feature_types, feat_input_dim, feature_alphabet])

    # train models in parallel or in loop
    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system('python task1_evaluate_best_joint_models.py --cnn-mem 4096 --input={0} --hidden={1} --input-feat {2} \
              --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                   feat_input_dim, epochs, layers,
                                                                                   optimization, train_path, test_path,
                                                                                   results_file_path,
                                                                                   sigmorphon_root_dir))
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'
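    # each aligned pair presumably consists of two equal-length strings in which '~'
    # marks positions with no counterpart character, e.g. ('walk~~', 'walked') for
    # ('walk', 'walked') - illustrative only, the actual alignment is decided by the
    # MCMC aligner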

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            trained_model, last_epoch = train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system('python task1_evaluate_best_joint_structured_models_blstm_feed_fix.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization):
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE
    }
    parallelize_training = True
    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # cluster the data by inflection type (features) - used for sanity check
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # generate params for each model
    params = []
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):
        params.append([
            input_dim, hidden_dim, layers, cluster_index, cluster_type,
            train_lemmas, train_feat_dicts, train_words, test_lemmas,
            test_feat_dicts, train_cluster_to_data_indices, test_words,
            test_cluster_to_data_indices, alphabet, alphabet_index,
            inverse_alphabet_index, epochs, optimization, results_file_path,
            feat_index, feature_types, feat_input_dim, feature_alphabet
        ])

    # train models in parallel or in loop
    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(
            len(train_cluster_to_data_indices))
        p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(
            len(train_cluster_to_data_indices))
        for p in params:
            train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system(
        'python task1_evaluate_best_joint_models.py --cnn-mem 4096 --input={0} --hidden={1} --input-feat {2} \
              --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.
        format(input_dim, hidden_dim, feat_input_dim, epochs, layers,
               optimization, train_path, test_path, results_file_path,
               sigmorphon_root_dir))
    return
def main():
    # train_path = '../data/heb/hebrew-task1-train'
    # dev_path = '../data/heb/hebrew-task1-dev'
    # test_path = '../data/heb/hebrew-task1-test'

    # train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/german-task1-train'
    # dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/german-task1-dev'
    # test_path = '../biu/gold/german-task1-test'

    train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-train'
    dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-dev'
    test_path = '../biu/gold/finnish-task1-test'

    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas,
     dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)
    align_symbol = '~'

    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    index2template = {}
    for i, aligned_pair in enumerate(train_aligned_pairs):
        template = task1_single_ms2s.generate_template_from_alignment(
            aligned_pair)
        index2template[i] = template
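    # each template presumably encodes the character-level edit pattern of one aligned
    # (lemma, word) pair; instantiate_template below applies such a pattern to a new
    # lemma to produce a candidate inflection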

    dev_handled = 0
    print 'now trying all templates on dev'
    for pair in dev_word_pairs:
        lemma, inflection = pair
        for template in index2template.values():
            prediction = task1_single_ms2s.instantiate_template(
                template, lemma)
            if prediction == inflection:
                dev_handled += 1
                break

    print "train templates handled {} examples in dev out of {}, {}%".format(
        dev_handled, len(dev_lemmas),
        float(dev_handled) / len(dev_lemmas) * 100)

    test_handled = 0
    print 'now trying all templates on test'
    for pair in test_word_pairs:
        lemma, inflection = pair
        for template in index2template.values():
            prediction = task1_single_ms2s.instantiate_template(
                template, lemma)
            if prediction == inflection:
                test_handled += 1
                break

    print "train templates handled {} examples in test out of {}, {}%".format(
        test_handled, len(test_lemmas),
        float(test_handled) / len(test_lemmas) * 100)
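    # these coverage numbers are an oracle-style upper bound: the fraction of dev/test
    # examples whose gold inflection is produced by at least one training template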
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    parallelize_training = False
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'CHAR_DROPOUT_PROB': CHAR_DROPOUT_PROB, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': REGULARIZATION,
                    'LEARNING_RATE': LEARNING_RATE}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # factored model: new model per inflection type. create input for each model and then parallelize or run in loop.
    params = []
    for morph_index, morph_type in enumerate(train_morph_to_data_indices):
        params.append([input_dim, hidden_dim, layers, morph_index, morph_type, train_lemmas, train_words, test_lemmas,
                       train_morph_to_data_indices, test_words, test_morph_to_data_indices, alphabet, alphabet_index,
                       inverse_alphabet_index, epochs, optimization, results_file_path, train_aligned_pairs,
                       test_aligned_pairs])

    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        p.map(train_morph_model, params)
        print 'finished training all models'
    else:
        for p in params:
            if not check_if_exists(p[-3], p[3]):
                train_morph_model(*p)
            else:
                print 'model ' + str(p[3]) + ' exists, skipping...'

    # evaluate best models
    os.system('python task1_evaluate_best_factored_structured_models.py --cnn-mem 8192 --input={0} --hidden={1} --epochs={2} \
              --layers={3} --optimization={4} {5} {6} {7} {8}'.format(input_dim, hidden_dim, epochs, layers,
                                                                      optimization, train_path, test_path,
                                                                      results_file_path,
                                                                      sigmorphon_root_dir))
    return
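
# Illustrative sketch only (not part of the original repo): dumb_align is referenced
# in the commented-out calls above but never shown. Under the assumption that it is a
# naive baseline aligner, it could simply right-pad the shorter string of each pair
# with the alignment symbol:
def dumb_align_sketch(word_pairs, align_symbol):
    aligned = []
    for lemma, word in word_pairs:
        length = max(len(lemma), len(word))
        # str.ljust pads on the right with align_symbol so both strings share a length
        aligned.append((lemma.ljust(length, align_symbol),
                        word.ljust(length, align_symbol)))
    return aligned
# e.g. dumb_align_sketch([('walk', 'walked')], '~') -> [('walk~~', 'walked')]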
Exemple #21
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim,
         ensemble):
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(
        train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # cluster the data by inflection type (features)
    # train_cluster_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_cluster_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

    task1_ndst_twin_2.evaluate_ndst(
        alphabet, alphabet_index, ensemble, feat_index, feat_input_dim,
        feature_alphabet, feature_types, hidden_dim, hyper_params, input_dim,
        inverse_alphabet_index, layers, results_file_path, sigmorphon_root_dir,
        test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
        test_words, train_cluster_to_data_indices, train_path, train_words)
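    # this variant only runs evaluation: it rebuilds the alphabets and indices and
    # hands everything to task1_ndst_twin_2.evaluate_ndst, assuming the per-cluster
    # models were already trained and saved elsewhere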
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    if not eval_only:

        # align the words to the inflections, the alignment will later be used by the model
        print 'started aligning'
        train_word_pairs = zip(train_lemmas, train_words)
        dev_word_pairs = zip(dev_lemmas, dev_words)

        # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
        train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

        # TODO: align together?
        dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
        print 'finished aligning'

        last_epochs = []
        trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas, train_feat_dicts,
                                                        train_words, dev_lemmas, dev_feat_dicts, dev_words,
                                                        alphabet, alphabet_index, inverse_alphabet_index, epochs,
                                                        optimization, results_file_path, train_aligned_pairs,
                                                        dev_aligned_pairs,
                                                        feat_index, feature_types, feat_input_dim, feature_alphabet,
                                                        plot)

        # record and print when the model stopped
        epoch_output = 'stopped on epoch {}'.format(last_epoch)
        last_epochs.append(epoch_output)
        print epoch_output

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    #~ print '=========DEV EVALUATION:========='
    #~ evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  #~ hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  #~ sigmorphon_root_dir, dev_feat_dicts, dev_lemmas, dev_path,
                  #~ dev_words, train_path)

    # eval on test
    print '=========TEST EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)

    return
Exemple #23
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                     'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(
        train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(
        test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                           train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for morph_index, morph_type in enumerate(train_morph_to_data_indices):

        # get the inflection-specific data
        train_morph_words = [train_words[i] for i in train_morph_to_data_indices[morph_type]]
        if len(train_morph_words) < 1:
            print 'only ' + str(len(train_morph_words)) + ' samples for this inflection type. skipping'
            continue
        else:
            print 'now evaluating model for morph ' + str(morph_index) + '/' + str(len(train_morph_to_data_indices)) + \
                  ': ' + morph_type + ' with ' + str(len(train_morph_words)) + ' examples'

        # test best model
        try:
            test_morph_lemmas = [test_lemmas[i] for i in test_morph_to_data_indices[morph_type]]
            test_morph_words = [test_words[i] for i in test_morph_to_data_indices[morph_type]]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(morph_index), alphabet,
                                                                    results_file_path, input_dim, hidden_dim, layers)

            predictions = task1_factored_inflection.predict(best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                                                            alphabet_index, inverse_alphabet_index, test_morph_lemmas,
                                                            test_morph_words)

            test_data = zip(test_morph_lemmas, test_morph_words)
            accuracy = task1_factored_inflection.evaluate_model(predictions, test_data)
            accuracies.append(accuracy)

            # get predictions in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_morph_to_data_indices[morph_type]:
                final_results[i] = (test_lemmas[i], predictions[test_lemmas[i]], morph_type)

        except KeyError:
            print 'could not find relevant examples in test data for morph: ' + morph_type

    accuracy_vals = [accuracies[i][1] for i in xrange(len(accuracies))]
    macro_avg_accuracy = sum(accuracy_vals)/len(accuracies)
    print 'macro avg accuracy: ' + str(macro_avg_accuracy)

    mic_nom = sum([accuracies[i][0]*accuracies[i][1] for i in xrange(len(accuracies))])
    mic_denom = sum([accuracies[i][0] for i in xrange(len(accuracies))])
    micro_average_accuracy = mic_nom/mic_denom
    print 'micro avg accuracy: ' + str(micro_average_accuracy)
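    # the micro average weights each cluster's accuracy (accuracies[i][1]) by what is
    # presumably its number of evaluated examples (accuracies[i][0]), while the macro
    # average above weights every cluster equally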

    task1_factored_inflection.write_results_file(hyper_params, macro_avg_accuracy, micro_average_accuracy, train_path,
                                                 test_path,
                                                 results_file_path + '.best', sigmorphon_root_dir, final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(
        train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(
        test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # cluster the data by inflection type (features)
    # train_cluster_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_cluster_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print 'only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping'
            continue
        else:
            print 'now evaluating model for cluster ' + str(cluster_index + 1) + '/' + \
                  str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' + \
                  str(len(train_cluster_words)) + ' examples'

        # test best model
        try:
            test_cluster_lemmas = [test_lemmas[i] for i in test_cluster_to_data_indices[cluster_type]]
            test_cluster_words = [test_words[i] for i in test_cluster_to_data_indices[cluster_type]]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_cluster_to_data_indices[cluster_type]]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(cluster_index), alphabet,
                                                                                  results_file_path, input_dim,
                                                                                  hidden_dim, layers,
                                                                                  feature_alphabet, feat_input_dim,
                                                                                  feature_types)

            predicted_templates = task1_joint_structured_inflection.predict_templates(best_model, decoder_rnn,
                                                                                      encoder_frnn, encoder_rrnn,
                                                                                      alphabet_index,
                                                                                      inverse_alphabet_index,
                                                                                      test_cluster_lemmas,
                                                                                      test_cluster_feat_dicts,
                                                                                      feat_index,
                                                                                      feature_types)

            accuracy = task1_joint_structured_inflection.evaluate_model(predicted_templates, test_cluster_lemmas,
                                                                        test_cluster_feat_dicts, test_cluster_words,
                                                                        feature_types, True)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_cluster_to_data_indices[cluster_type]:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection.instantiate_template(predicted_templates[joint_index],
                                                                                    test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print 'could not find relevant examples in test data for cluster: ' + cluster_type

    accuracy_vals = [accuracies[i][1] for i in xrange(len(accuracies))]
    macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)
    print 'macro avg accuracy: ' + str(macro_avg_accuracy)

    mic_nom = sum([accuracies[i][0] * accuracies[i][1] for i in xrange(len(accuracies))])
    mic_denom = sum([accuracies[i][0] for i in xrange(len(accuracies))])
    micro_average_accuracy = mic_nom / mic_denom
    print 'micro avg accuracy: ' + str(micro_average_accuracy)

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'
    common.write_results_file(hyper_params, micro_average_accuracy, train_path,
                                              test_path, results_file_path + suffix, sigmorphon_root_dir,
                                              final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3*MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if parallelize_training:

        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        last_epochs = []
        for p in params:
            cluster_index = p[3]
            cluster_name = p[4]
            trained_model, last_epoch = train_cluster_model(*p)

            # print when each model stopped
            epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name,
                                                                           last_epoch)
            last_epochs.append(epoch_output)
            print epoch_output

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(last_epochs)

    print 'finished training all models'

    # evaluate best models
    os.system('python task1_evaluate_best_nfst_models.py --cnn-mem 6096 --input={0} --hidden={1} \
    --feat-input={2} --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    for e in last_epochs:
        print 'last epoch is {}'.format(e)

    return
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, override, eval_only, ensemble):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    model_file_name = results_file_path + '_bestmodel.txt'
    if os.path.isfile(model_file_name) and not override:
        print 'loading existing model from {}'.format(model_file_name)
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(alphabet,
                                                                         results_file_path, input_dim,
                                                                         hidden_dim, layers, feature_alphabet,
                                                                         feat_input_dim, feature_types)
        print 'loaded existing model successfully'
    else:
        print 'could not find existing model or explicit override was requested. starting training from scratch...'
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(alphabet, input_dim, hidden_dim, layers,
                                                                     feature_types, feat_input_dim, feature_alphabet)
    if not eval_only:
        # start training
        trained_model, last_epoch, best_epoch = train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn,
                                                            train_lemmas, train_feat_dicts, train_words, dev_lemmas,
                                                            dev_feat_dicts, dev_words, alphabet_index,
                                                            inverse_alphabet_index, epochs, optimization,
                                                            results_file_path, feat_index, feature_types, plot)
        model = trained_model
        print 'last epoch is {}'.format(last_epoch)
        print 'best epoch is {}'.format(best_epoch)
        print 'finished training'
    else:
        print 'skipped training, evaluating on test set...'

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(alphabet, alphabet_index, ensemble, feat_index,
                                                             feat_input_dim, feature_alphabet, feature_types,
                                                             hidden_dim, input_dim, inverse_alphabet_index, layers,
                                                             test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                inverse_alphabet_index, test_lemmas, test_feat_dicts, feat_index,
                                                feature_types)
    if len(predicted_sequences) > 0:
        # evaluate last model on test
        amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words, feature_types,
                                          print_results=False)
        print 'initial eval: {}% accuracy'.format(accuracy)

        final_results = {}
        for i in xrange(len(test_lemmas)):
            joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
            inflection = predicted_sequences[joint_index]
            final_results[i] = (test_lemmas[i], test_feat_dicts[i], ''.join(inflection))

        # evaluate best models
        common.write_results_file_and_evaluate_externally(hyper_params, accuracy, train_path, test_path,
                                                          results_file_path + '.external_eval.txt', sigmorphon_root_dir,
                                                          final_results)
    return
Exemple #27
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization):
    parallelize_training = PARALLELIZE
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'
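    # --- illustrative sketch, not part of the original repository ---
    # for intuition only: a naive alignment in the spirit of the commented-out
    # dumb_align above could pad the shorter string with the align symbol so both
    # strings end up the same length; the actual common.mcmc_align is an MCMC-based
    # aligner and may produce different alignments.
    def naive_align_sketch(pair, symbol):
        lemma, word = pair
        max_len = max(len(lemma), len(word))
        return (lemma + symbol * (max_len - len(lemma)),
                word + symbol * (max_len - len(word)))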

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices
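    # --- illustrative sketch, not part of the original repository ---
    # common.cluster_data_by_pos is assumed here to return a mapping from each POS
    # value to the list of example indices carrying that POS ('pos' as the feature
    # key is an assumption); a hypothetical reimplementation under that assumption:
    def cluster_by_pos_sketch(feat_dicts, pos_key='pos'):
        from collections import defaultdict
        pos_to_indices = defaultdict(list)
        for i, feats in enumerate(feat_dicts):
            pos_to_indices[feats.get(pos_key, 'UNK_POS')].append(i)
        return dict(pos_to_indices)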

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # TODO: change build_model (done), train_model (in progress), predict (done), one word loss (done) etc. to take the
    # features into account

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):
        params.append([
            input_dim, hidden_dim, layers, cluster_index, cluster_type,
            train_lemmas, train_feat_dicts, train_words, test_lemmas,
            test_feat_dicts, train_cluster_to_data_indices, test_words,
            test_cluster_to_data_indices, alphabet, alphabet_index,
            inverse_alphabet_index, epochs, optimization, results_file_path,
            train_aligned_pairs, test_aligned_pairs, feat_index, feature_types,
            feat_input_dim, feature_alphabet
        ])

    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(
            len(train_cluster_to_data_indices))
        p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(
            len(train_cluster_to_data_indices))
        for p in params:
            train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system(
        'python task1_evaluate_best_joint_structured_models.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.
        format(input_dim, hidden_dim, feat_input_dim, epochs, layers,
               optimization, train_path, test_path, results_file_path,
               sigmorphon_root_dir))
    return
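# --- illustrative sketch, not part of the original repository ---
# the external evaluation above is launched through os.system with one shell string;
# an equivalent, shell-free invocation via subprocess (same assumed CLI flags as in
# the call above) could be built like this.
import subprocess

def run_external_eval_sketch(input_dim, hidden_dim, feat_input_dim, epochs, layers,
                             optimization, train_path, test_path, results_file_path,
                             sigmorphon_root_dir):
    cmd = ['python', 'task1_evaluate_best_joint_structured_models.py',
           '--cnn-mem', '6096',
           '--input={0}'.format(input_dim),
           '--hidden={0}'.format(hidden_dim),
           '--feat-input={0}'.format(feat_input_dim),
           '--epochs={0}'.format(epochs),
           '--layers={0}'.format(layers),
           '--optimization={0}'.format(optimization),
           train_path, test_path, results_file_path, sigmorphon_root_dir]
    return subprocess.call(cmd)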
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # step symbol - indicates that the FST should step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    dev_pos_to_data_indices = common.cluster_data_by_pos(dev_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    dev_cluster_to_data_indices = dev_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(dev_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # dev_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, dev_lemmas, dev_feat_dicts, train_cluster_to_data_indices, dev_words,
                       dev_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, dev_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if not eval_only:
        if parallelize_training:

            # set maxtasksperchild=1 to free finished processes
            p = Pool(4, maxtasksperchild=1)
            print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
            p.map(train_cluster_model_wrapper, params)
        else:
            print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
            last_epochs = []
            for p in params:
                cluster_index = p[3]
                cluster_name = p[4]
                trained_model, last_epoch = train_cluster_model(*p)

                # print when did each model stop
                epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name, last_epoch)
                last_epochs.append(epoch_output)
                print epoch_output

            with open(results_file_path + '.epochs', 'w') as f:
                # writelines() does not add newlines, so append them explicitly
                f.writelines(line + '\n' for line in last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    print '=========DEV EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, dev_cluster_to_data_indices, dev_feat_dicts, dev_lemmas, dev_path,
                  dev_words, train_cluster_to_data_indices, train_path, train_words)

    # eval on test
    print '=========TEST EVALUATION:========='
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_cluster_to_data_indices, train_path, train_words)

    return
def main(train_path, dev_path, test_path, results_path):
    # read morph input files (train, dev and test)
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    (dev_words, dev_lemmas,
     dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)

    merged_train_dev_lemmas = []
    merged_train_dev_words = []
    merged_train_dev_feat_dicts = []

    if dev_path != 'NONE':
        # merge the train and dev files, if dev exists
        merged_train_dev_lemmas += train_lemmas
        merged_train_dev_lemmas += dev_lemmas

        merged_train_dev_words += train_words
        merged_train_dev_words += dev_words

        merged_train_dev_feat_dicts += train_feat_dicts
        merged_train_dev_feat_dicts += dev_feat_dicts

    # TODO: optional - implement data augmentation

    # concatenate feats and characters for input
    tokenized_test_inputs, tokenized_test_outputs = convert_sigmorphon_to_MED_format(
        test_feat_dicts, test_lemmas, test_words)

    tokenized_train_inputs, tokenized_train_outputs = convert_sigmorphon_to_MED_format(
        train_feat_dicts, train_lemmas, train_words)

    tokenized_dev_inputs, tokenized_dev_outputs = convert_sigmorphon_to_MED_format(
        dev_feat_dicts, dev_lemmas, dev_words)

    tokenized_merged_inputs, tokenized_merged_outputs = convert_sigmorphon_to_MED_format(
        merged_train_dev_feat_dicts, merged_train_dev_lemmas,
        merged_train_dev_words)

    parallel_data = zip(tokenized_train_inputs, tokenized_train_outputs)

    # write input and output files
    train_inputs_file_path, train_outputs_file_path = write_converted_file(
        results_path, tokenized_train_inputs, tokenized_train_outputs,
        'train.in', 'train.out')

    train_inputs_file_path, train_outputs_file_path = write_converted_file(
        results_path, tokenized_train_inputs, tokenized_train_outputs,
        'train.in.tok', 'train.out.tok')

    test_inputs_file_path, test_outputs_file_path = write_converted_file(
        results_path, tokenized_test_inputs, tokenized_test_outputs, 'test.in',
        'test.out')

    test_inputs_file_path, test_outputs_file_path = write_converted_file(
        results_path, tokenized_test_inputs, tokenized_test_outputs,
        'test.in.tok', 'test.out.tok')

    merged_inputs_file_path, merged_outputs_file_path = write_converted_file(
        results_path, tokenized_merged_inputs, tokenized_merged_outputs,
        'merged.in', 'merged.out')

    merged_inputs_file_path, merged_outputs_file_path = write_converted_file(
        results_path, tokenized_merged_inputs, tokenized_merged_outputs,
        'merged.in.tok', 'merged.out.tok')

    dev_inputs_file_path, dev_outputs_file_path = write_converted_file(
        results_path, tokenized_dev_inputs, tokenized_dev_outputs, 'dev.in',
        'dev.out')

    dev_inputs_file_path, dev_outputs_file_path = write_converted_file(
        results_path, tokenized_dev_inputs, tokenized_dev_outputs,
        'dev.in.tok', 'dev.out.tok')

    # after the above files are created, preprocess them (hackily) by setting the args variables in prepare_data.py to
    # point to the created files. the only changes required in the original prepare_data.py code are:

    # args.source = 'train.in'
    # args.target = 'train.out'
    # args.source_dev = 'test.in'
    # args.target_dev = 'test.out'

    # tr_files = ['/Users/roeeaharoni/GitHub/morphological-reinflection/src/machine_translation/data/train.in',
    #             '/Users/roeeaharoni/GitHub/morphological-reinflection/src/machine_translation/data/train.out']

    # change shuf to gshuf on mac

    # blocks search.py - line 102 - add on_unused_input='ignore'

    # eventually, run the training script on the preprocessed files after changing these values in configuration.py:
    # bleu_val_freq, val_burn_in, val_set, val_set_grndtruth

    # and then run:
    # python -m machine_translation

    # finally, run the script that converts the validation_out.txt file into the sigmorphon format and run the evaluation
    sigmorphon_dev_file_path = dev_path
    MED_validation_file_path = './search_model_morph/validation_out.txt'
    output_file_path = './search_model_morph/validation_out.sigmorphon.txt'
    convert_MED_output_to_sigmorphon_format(sigmorphon_dev_file_path,
                                            MED_validation_file_path,
                                            output_file_path)

    return
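# --- illustrative sketch, not part of the original repository ---
# convert_sigmorphon_to_MED_format is used above but not shown here; in MED-style
# setups the encoder input is typically the morphological feature tokens followed by
# the lemma characters, and the output is the inflected word's characters, all space
# separated. a hypothetical version under that assumption:
def convert_sigmorphon_to_MED_format_sketch(feat_dicts, lemmas, words):
    tokenized_inputs, tokenized_outputs = [], []
    for feats, lemma, word in zip(feat_dicts, lemmas, words):
        feat_tokens = ['{0}={1}'.format(k, feats[k]) for k in sorted(feats)]
        tokenized_inputs.append(' '.join(feat_tokens + list(lemma)))
        tokenized_outputs.append(' '.join(list(word)))
    return tokenized_inputs, tokenized_outputs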