예제 #1
0
def smtp_pipeline(config):
    """Run the full pivot-based statistical MT pipeline described by *config*.

    Reads environment and iteration settings from *config* (a ConfigParser-style
    object), then tokenizes, cleanses, splits, and matches the two parallel
    corpora (source->pivot and pivot->target), trains and tunes a Moses system
    for each leg, and finally scores end-to-end pivot translation quality.

    Args:
        config: configuration object supporting get/getint/getfloat with
            "Environment Settings" and "Iteration Settings" sections.
    """
    path_to_moses = config.get("Environment Settings", "path_to_moses_decoder")
    mem_limit = config.getint("Environment Settings", "mem_limit")
    max_len = config.getint("Iteration Settings", "max_sentence_len")
    min_len = config.getint("Iteration Settings", "min_sentence_len")

    srcf = utilities.safe_string(config.get("Iteration Settings", "src_lang_data"))
    piv1f = utilities.safe_string(config.get("Iteration Settings", "src_piv_lang_data"))
    piv2f = utilities.safe_string(config.get("Iteration Settings", "piv_tar_lang_data"))
    tarf = utilities.safe_string(config.get("Iteration Settings", "tar_lang_data"))
    train = config.getfloat("Iteration Settings", "train_split")
    test = config.getfloat("Iteration Settings", "test_split")
    ncpus = config.getint("Environment Settings", "ncpus")
    ngram = config.getint("Environment Settings", "ngram")
    work_dir1 = utilities.safe_string(config.get("Iteration Settings", "working_dir_first_leg"))
    work_dir2 = utilities.safe_string(config.get("Iteration Settings", "working_dir_second_leg"))

    pair1, pair2 = FileDataPair(srcf, piv1f), FileDataPair(piv2f, tarf)
    raw_files = pair1.get_raw_filenames() + pair2.get_raw_filenames()
    pair1_tokenized_src, pair1_tokenized_tar = pair1.get_tokenized_filenames()
    pair2_tokenized_src, pair2_tokenized_tar = pair2.get_tokenized_filenames()
    pair1_cleansed_src, pair1_cleansed_tar = pair1.get_cleansed_filenames()
    pair2_cleansed_src, pair2_cleansed_tar = pair2.get_cleansed_filenames()

    parser = Parser(path_to_moses, mem_limit, max_len, min_len, False)
    parser.tokenize_files(raw_files)
    parser.cleanse(pair1_tokenized_src, pair1_tokenized_tar)
    parser.cleanse(pair2_tokenized_src, pair2_tokenized_tar)
    parser.split_train_tune_test(pair1_cleansed_src, pair1_cleansed_tar,
        pair2_cleansed_src, pair2_cleansed_tar, train, test)

    # BUG FIX: the test filenames were previously referenced before assignment
    # (they were fetched only after parser.match), raising NameError; the
    # second argument was also pair2_test_tar instead of pair1_test_tar
    # (cf. the src1/tar1/src2/tar2 order used by parser.match elsewhere).
    pair1_test_src, pair1_test_tar = pair1.get_test_filenames()
    pair2_test_src, pair2_test_tar = pair2.get_test_filenames()
    parser.match(pair1_test_src, pair1_test_tar, pair2_test_src, pair2_test_tar)

    pair1_target_train_filename = pair1.get_target_train_filename()
    pair2_target_train_filename = pair2.get_target_train_filename()
    pair1_train_src, pair1_train_tar = pair1.get_train_filenames()
    pair2_train_src, pair2_train_tar = pair2.get_train_filenames()

    # Build target-side language models, then train each leg independently.
    trainer = Train(path_to_moses, ncpus, ngram, False)
    trainer.build_language_models(pair1_target_train_filename)
    trainer.build_language_models(pair2_target_train_filename)
    trainer.train(pair1_train_src, pair1_train_tar, work_dir1)
    trainer.train(pair2_train_src, pair2_train_tar, work_dir2)

    pair1_tune_src, pair1_tune_tar = pair1.get_tune_filenames()
    pair2_tune_src, pair2_tune_tar = pair2.get_tune_filenames()

    tuner = Tune(path_to_moses, ncpus, False)
    tuner.tune(pair1_tune_src, pair1_tune_tar, work_dir1)
    tuner.tune(pair2_tune_src, pair2_tune_tar, work_dir2)

    # Evaluation files are used for final scoring of the pivoted translation.
    # Renamed locals (pair*_eval) so they no longer shadow the test filenames,
    # and `tester` so it no longer clobbers the `test` split ratio above.
    pair1_eval = pair1.get_eval_filename()
    pair2_eval = pair2.get_eval_filename()

    tester = Test(path_to_moses, False)
    tester.test_pivoting_quality(pair1_eval, work_dir1,
        pair2_eval, work_dir2)
def main():
    """Drive the complete es->en->fr pivot translation workflow end to end:
    tokenize, cleanse, split, match, train, tune, and evaluate, with an
    interactive translator session for each trained leg.
    """
    # Verbose parser so each preprocessing step is echoed to the console.
    corpus_parser = Parser(True)

    # Tokenize every raw Europarl corpus file.
    for raw_corpus in (
        "src/europarl-v7.es-en.es",
        "src/europarl-v7.es-en.en",
        "src/europarl-v7.fr-en.en",
        "src/europarl-v7.fr-en.fr",
    ):
        corpus_parser.tokenize(raw_corpus)

    # Cleanse each tokenized bitext pair.
    corpus_parser.cleanse("data/europarl-v7.es-en.es.tok", "data/europarl-v7.es-en.en.tok")
    corpus_parser.cleanse("data/europarl-v7.fr-en.en.tok", "data/europarl-v7.fr-en.fr.tok")

    # Carve the cleansed data into 60% train / 20% tune / remainder test.
    corpus_parser.split_train_tune_test("data/europarl-v7.es-en.es.tok.cleansed", "data/europarl-v7.es-en.en.tok.cleansed",
        "data/europarl-v7.fr-en.en.tok.cleansed", "data/europarl-v7.fr-en.fr.tok.cleansed", .6, .2)

    # Align the two legs' test sets so end-to-end scoring compares like with like.
    corpus_parser.match("data/test/europarl-v7.es-en.es.tok.cleansed.test", "data/test/europarl-v7.es-en.en.tok.cleansed.test",
        "data/test/europarl-v7.fr-en.en.tok.cleansed.test", "data/test/europarl-v7.fr-en.fr.tok.cleansed.test")

    mt_trainer = Train(True)
    # Language models are built for the target side of each leg only.
    mt_trainer.build_language_models("data/train/europarl-v7.es-en.en.tok.cleansed.train")
    mt_trainer.build_language_models("data/train/europarl-v7.fr-en.fr.tok.cleansed.train")

    # One Moses training run per leg, each writing into its own working dir.
    mt_trainer.train("data/train/europarl-v7.es-en.es.tok.cleansed.train",
        "data/train/europarl-v7.es-en.en.tok.cleansed.train", "es-en.working")
    mt_trainer.train("data/train/europarl-v7.fr-en.en.tok.cleansed.train",
        "data/train/europarl-v7.fr-en.fr.tok.cleansed.train", "en-fr.working")

    # Tune both legs against the held-out tune split.
    mt_tuner = Tune(True)
    mt_tuner.tune("data/tune/europarl-v7.es-en.es.tok.cleansed.tune",
        "data/tune/europarl-v7.es-en.en.tok.cleansed.tune", "es-en.working")
    mt_tuner.tune("data/tune/europarl-v7.fr-en.en.tok.cleansed.tune",
        "data/tune/europarl-v7.fr-en.fr.tok.cleansed.tune", "en-fr.working")

    evaluator = Test(True)
    # Interactive translation server, one session per leg.
    evaluator.test_translator_interactive("es-en.working")
    evaluator.test_translator_interactive("en-fr.working")

    # Score each leg independently on its held-out test data.
    evaluator.test_translation_quality("data/test/europarl-v7.es-en.es.tok.cleansed.test",
        "data/test/europarl-v7.es-en.en.tok.cleansed.test", "es-en.working")
    evaluator.test_translation_quality("data/test/europarl-v7.fr-en.en.tok.cleansed.test",
        "data/test/europarl-v7.fr-en.fr.tok.cleansed.test", "en-fr.working")
    # Interactive session through the full es->en->fr pivot chain.
    evaluator.test_pivoting_interactive("es-en.working", "en-fr.working")

    # Final end-to-end score on the matched test data.
    evaluator.test_pivoting_quality("data/test/europarl-v7.es-en.es.tok.cleansed.test.matched",
        "es-en.working", "data/test/europarl-v7.fr-en.fr.tok.cleansed.test.matched", "en-fr.working")
예제 #3
0
def main():
    """Build target-language models and train both legs (es->en and en->fr)
    of a pivot translation system from pre-split training data under
    data/train/. Each leg writes its results to its own working directory.
    """

    # Create Train instance, set verbose to True to see what's happening
    trainer = Train(True)

    # Build target language models for only the target languages. In this
    # scenario, the desired target languages are the pivot language in the
    # source to pivot leg of the translation and the target language in the
    # pivot to target leg of the scenario. Language models are saved in the
    # lm directory
    trainer.build_language_models("data/train/europarl-v7.es-en.en.tok.cleansed.train")
    trainer.build_language_models("data/train/europarl-v7.fr-en.fr.tok.cleansed.train")

    # Train each leg of the translation system separately. The first
    # parameter must be the path to the source training data, the second
    # will be the path to the pivot training data, and the third is the
    # name for the directory which will store the system's results.
    trainer.train("data/train/europarl-v7.es-en.es.tok.cleansed.train",
        "data/train/europarl-v7.es-en.en.tok.cleansed.train", "es-en.working")
    trainer.train("data/train/europarl-v7.fr-en.en.tok.cleansed.train",
        "data/train/europarl-v7.fr-en.fr.tok.cleansed.train", "en-fr.working")