def generate_lm (easy_config, training_filename) : command1 = ("export IRSTLM=" + easy_config.irstlm_path + "; " + easy_config.irstlm_path + "bin/build-lm.sh"\ + " -n 5"\ + " -i " + os.path.join(easy_config.easy_lm, training_filename + ".sb." + exp_config["target_id"])\ + " -t ./tmp -p -s improved-kneser-ney -o " + os.path.join(easy_config.easy_lm, training_filename + ".lm." + exp_config["target_id"])) write_step (command1, easy_config) os.system(command1)
def limiting_sentence_length (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "scripts/training/clean-corpus-n.perl "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".true " + exp_config["source_id"] + " " + exp_config["target_id"])\ + " " + os.path.join(easy_config.easy_corpus, training_filename +".clean 1 "\ + exp_config["sentence_length"]) write_step (command1, easy_config) os.system(command1)
def limiting_sentence_length(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "scripts/training/clean-corpus-n.perl "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".true " + exp_config["source_id"] + " " + exp_config["target_id"])\ + " " + os.path.join(easy_config.easy_corpus, training_filename +".clean 1 "\ + exp_config["sentence_length"]) write_step(command1, easy_config) os.system(command1)
def t_filter_model_given_input(easy_config, path, filename): command1 = easy_config.mosesdecoder_path + "scripts/training/filter-model-given-input.pl "\ + " " + os.path.join(path, "filtered-" + filename)\ + " " + easy_config.easy_tuning + "/moses.ini "\ + " " + os.path.join(path,filename) write_step(command1, easy_config) os.system(command1)
def generate_lm(easy_config, training_filename): command1 = ("export IRSTLM=" + easy_config.irstlm_path + "; " + easy_config.irstlm_path + "bin/build-lm.sh"\ + " -n 5"\ + " -i " + os.path.join(easy_config.easy_lm, training_filename + ".sb." + exp_config["target_id"])\ + " -t ./tmp -p -s improved-kneser-ney -o " + os.path.join(easy_config.easy_lm, training_filename + ".lm." + exp_config["target_id"])) write_step(command1, easy_config) os.system(command1)
def t_filter_model_given_input (easy_config, path, filename) : command1 = easy_config.mosesdecoder_path + "scripts/training/filter-model-given-input.pl "\ + " " + os.path.join(path, "filtered-" + filename)\ + " " + easy_config.easy_tuning + "/moses.ini "\ + " " + os.path.join(path,filename) write_step (command1, easy_config) os.system(command1)
def shuff(easy_config, training_filename): command1 = "python " + easy_config.nmt_path + "preprocess/shuffle-hdf5.py " \ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["source_id"] + ".h5")\ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["target_id"] + ".h5")\ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["source_id"] + ".shuf.h5")\ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["target_id"] + ".shuf.h5") write_step (command1, easy_config) os.system(command1)
def truecaser(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "scripts/recaser/train-truecaser.perl --model "\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"]) + " --corpus "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/train-truecaser.perl --model "\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"]) + " --corpus "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def hdf5(easy_config, training_filename): command1 = "python " + easy_config.nmt_path + "preprocess/convert-pkl2hdf5.py " \ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["source_id"] + ".pkl")\ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["source_id"] + ".h5") command2 = "python " + easy_config.nmt_path + "preprocess/convert-pkl2hdf5.py " \ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["target_id"] + ".pkl")\ + " " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["target_id"] + ".h5") write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def truecaser (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "scripts/recaser/train-truecaser.perl --model "\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"]) + " --corpus "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/train-truecaser.perl --model "\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"]) + " --corpus "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def prepare_neural_language_model(easy_config): command1 = (easy_config.nplm_path + "bin/prepareNeuralLM " + " --train_text " + easy_config.easy_nplm + training_filename + ".true." + exp_config["target_id"] + " --ngram_size 5 " + " --vocab_size 20000 " + " --write_words_file " + easy_config.easy_nplm + "words " + " --train_file " + easy_config.easy_nplm + "train.ngrams " + " --validation_size 500 " + " --validation_file " + easy_config.easy_nplm + "validation.ngrams " + " >& " + easy_config.easy_nplm + "prepareout.out &") write_step(command1, easy_config) os.system(command1)
def prepare_neural_language_model (easy_config) : command1 = (easy_config.nplm_path + "bin/prepareNeuralLM " + " --train_text " + easy_config.easy_nplm + training_filename + ".true." + exp_config["target_id"] + " --ngram_size 5 " + " --vocab_size 20000 " + " --write_words_file " + easy_config.easy_nplm + "words " + " --train_file " + easy_config.easy_nplm + "train.ngrams " + " --validation_size 500 " + " --validation_file " + easy_config.easy_nplm + "validation.ngrams " + " >& " + easy_config.easy_nplm + "prepareout.out &") write_step (command1, easy_config) os.system(command1)
def invert(easy_config, training_filename): print "----------- invert ------------" command1 = "python " + easy_config.nmt_path + "preprocess/invert-dict.py " \ + " " + os.path.join(easy_config.easy_corpus, "vocab." + exp_config["source_id"] + ".pkl")\ + " " + os.path.join(easy_config.easy_corpus, "ivocab." + exp_config["source_id"] + ".pkl") command2 = "python " + easy_config.nmt_path + "preprocess/invert-dict.py " \ + " " + os.path.join(easy_config.easy_corpus, "vocab." + exp_config["target_id"] + ".pkl")\ + " " + os.path.join(easy_config.easy_corpus, "ivocab." + exp_config["target_id"] + ".pkl") write_step (command1, easy_config) os.system (command1) write_step (command2, easy_config) os.system (command2)
def tuning_process (easy_config, devfilename) : command1 = "nohup nice " + easy_config.mosesdecoder_path + "scripts/training/mert-moses.pl "\ + "--decoder-flags=\"-threads "+exp_config["threads"]+"\""\ + " -threads " + exp_config["threads"]\ + " -maximum-iterations " + exp_config["tuning_max_iterations"]\ + " -working-dir " + easy_config.easy_tuning\ + " " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["source_id"])\ + " " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["target_id"])\ + " " + easy_config.mosesdecoder_path + "bin/moses_chart " + os.path.join(easy_config.easy_train,"model/moses.ini ")\ + " --mertdir " + easy_config.mosesdecoder_path + "bin/ &> " + os.path.join(easy_config.easy_tuning, "mert.out") + " &" write_step (command1, easy_config) os.system(command1)
def tuning_process(easy_config, devfilename): command1 = "nohup nice " + easy_config.mosesdecoder_path + "scripts/training/mert-moses.pl "\ + "--decoder-flags=\"-threads "+exp_config["threads"]+"\""\ + " -threads " + exp_config["threads"]\ + " -maximum-iterations " + exp_config["tuning_max_iterations"]\ + " -working-dir " + easy_config.easy_tuning\ + " " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["source_id"])\ + " " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["target_id"])\ + " " + easy_config.mosesdecoder_path + "bin/moses_chart " + os.path.join(easy_config.easy_train,"model/moses.ini ")\ + " --mertdir " + easy_config.mosesdecoder_path + "bin/ &> " + os.path.join(easy_config.easy_tuning, "mert.out") + " &" write_step(command1, easy_config) os.system(command1)
def extract_training (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "scripts/training/bilingual-lm/extract_training.py"\ + " --working-dir " + easy_config.easy_bnplm\ + " --corpus " + os.path.join(easy_config.easy_corpus, training_filename + ".clean")\ + " --source-language " + exp_config["source_id"]\ + " --target-language " + exp_config["target_id"]\ + " --align " + os.path.join(easy_config.easy_train, "model/aligned.grow-diag-final-and")\ + " --prune-target-vocab " + exp_config["target_vocb"]\ + " --prune-source-vocab " + exp_config["source_vocb"]\ + " --target-context " + exp_config["bnplm_target_context"]\ + " --source-context " + exp_config["bnplm_source_context"] write_step (command1, easy_config) os.system(command1)
def tuning_truecase (easy_config, devfilename) : command1 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"])\ + " < " + os.path.join(easy_config.easy_tuning, devfilename + ".tok." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_tuning, devfilename + ".tok." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def tuning_truecase(easy_config, devfilename): command1 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"])\ + " < " + os.path.join(easy_config.easy_tuning, devfilename + ".tok." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_tuning, devfilename + ".tok." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_tuning, devfilename + ".true." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def extract_training(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "scripts/training/bilingual-lm/extract_training.py"\ + " --working-dir " + easy_config.easy_bnplm\ + " --corpus " + os.path.join(easy_config.easy_corpus, training_filename + ".clean")\ + " --source-language " + exp_config["source_id"]\ + " --target-language " + exp_config["target_id"]\ + " --align " + os.path.join(easy_config.easy_train, "model/aligned.grow-diag-final-and")\ + " --prune-target-vocab " + exp_config["target_vocb"]\ + " --prune-source-vocab " + exp_config["source_vocb"]\ + " --target-context " + exp_config["bnplm_target_context"]\ + " --source-context " + exp_config["bnplm_source_context"] write_step(command1, easy_config) os.system(command1)
def train_neural_network(easy_config): command1 = (easy_config.nplm_path + "bin/trainNeuralNetwork " + " --train_file " + easy_config.easy_nplm + "train.ngrams " + " --validation_file " + easy_config.easy_nplm + "validation.ngrams " + " --num_epochs 30 " + " --input_words_file " + easy_config.easy_nplm + "words " + " --model_prefix " + easy_config.easy_nplm + "model " + " --input_embedding_dimension 150 " + " --num_hidden 0" + " --output_embedding_dimension 750 " + " --num_threads " + exp_config["threads"] + " >& " + easy_config.easy_nplm + "nplmtrain.out &") write_step(command1, easy_config) os.system(command1)
def translation_model(easy_config, training_filename): command1 = "nohup nice " + easy_config.mosesdecoder_path + "scripts/training/train-model.perl "\ + " -mgiza -mgiza-cpus " + exp_config["threads"] + " -cores 2 "\ + " -root-dir " + easy_config.easy_train\ + " -corpus " + " " + os.path.join(easy_config.easy_corpus, training_filename + ".clean")\ + " -f " + exp_config["source_id"] + " -e " + exp_config["target_id"]\ + " -alignment grow-diag-final-and "\ + " " + exp_config["phrase"]\ + " msd-bidirectional-fe -lm 0:"+exp_config["n-gram"]+":"\ + os.path.join(easy_config.easy_lm, training_filename + ".blm." + exp_config["target_id"]) + ":8 "\ + " -external-bin-dir " + easy_config.giza_path + "bin"\ + " >& " + os.path.join(easy_config.easy_train, "training.out") + " &" write_step(command1, easy_config) os.system(command1)
def translation_model (easy_config, training_filename) : command1 = "nohup nice " + easy_config.mosesdecoder_path + "scripts/training/train-model.perl "\ + " -mgiza -mgiza-cpus " + exp_config["threads"] + " -cores 2 "\ + " -root-dir " + easy_config.easy_train\ + " -corpus " + " " + os.path.join(easy_config.easy_corpus, training_filename + ".clean")\ + " -f " + exp_config["source_id"] + " -e " + exp_config["target_id"]\ + " -alignment grow-diag-final-and "\ + " " + exp_config["phrase"]\ + " msd-bidirectional-fe -lm 0:"+exp_config["n-gram"]+":"\ + os.path.join(easy_config.easy_lm, training_filename + ".blm." + exp_config["target_id"]) + ":8 "\ + " -external-bin-dir " + easy_config.giza_path + "bin"\ + " >& " + os.path.join(easy_config.easy_train, "training.out") + " &" write_step (command1, easy_config) os.system(command1)
def t_truecasing (easy_config, testfilename) : command1 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".tok." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["source_id"])\ # + " > " + easy_config.easy_evaluation + testfilename + ".translated.true." + exp_config["target_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".tok." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def prepare_corpus (easy_config) : command1 = (easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["target_id"] + " -threads " + exp_config["threads"] + " -no-escape 1 " + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["target_id"] + " > " + easy_config.easy_nplm + training_filename + ".tok." + exp_config["target_id"]) command2 = (easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model " + " " + easy_config.easy_truecaser + "truecase-model." + exp_config["target_id"] + " < " + easy_config.easy_nplm + training_filename + ".tok." + exp_config["target_id"] + " > " + easy_config.easy_nplm + training_filename + ".true." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def train_neural_network (easy_config) : command1 = (easy_config.nplm_path + "bin/trainNeuralNetwork " + " --train_file " + easy_config.easy_nplm + "train.ngrams " + " --validation_file " + easy_config.easy_nplm + "validation.ngrams " + " --num_epochs 30 " + " --input_words_file " + easy_config.easy_nplm + "words " + " --model_prefix " + easy_config.easy_nplm + "model " + " --input_embedding_dimension 150 " + " --num_hidden 0" + " --output_embedding_dimension 750 " + " --num_threads "+ exp_config["threads"] + " >& " + easy_config.easy_nplm + "nplmtrain.out &") write_step (command1, easy_config) os.system(command1)
def tokenisation (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["source_id"]\ + " -threads " + exp_config["threads"]\ + " -no-escape 1 "\ + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["source_id"] + " > "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["target_id"]\ + " -threads " + exp_config["threads"]\ + " -no-escape 1 "\ + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["target_id"] + " > "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def t_truecasing(easy_config, testfilename): command1 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["source_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".tok." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["source_id"])\ # + " > " + easy_config.easy_evaluation + testfilename + ".translated.true." + exp_config["target_id"]) command2 = easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model"\ + " " + os.path.join(easy_config.easy_truecaser, "truecase-model." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".tok." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def run_test(easy_config, testfilename): command1 = "nohup nice " + easy_config.mosesdecoder_path + "bin/moses "\ + " -threads "+exp_config["threads"]\ + " -f " + os.path.join(easy_config.easy_evaluation, "filtered-" +testfilename+ ".true." + exp_config["source_id"] + "/moses.ini ")\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".translated." + exp_config["target_id"])\ + " 2> " + os.path.join(easy_config.easy_evaluation, testfilename + ".out") + " " command2 = easy_config.mosesdecoder_path + "scripts/generic/multi-bleu.perl "\ + " -lc " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".translated." + exp_config["target_id"]) # + " < " + easy_config.easy_evaluation + testfilename + ".translated." + exp_config["target_id"] + ".9" write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def run_test (easy_config, testfilename) : command1 = "nohup nice " + easy_config.mosesdecoder_path + "bin/moses "\ + " -threads "+exp_config["threads"]\ + " -f " + os.path.join(easy_config.easy_evaluation, "filtered-" +testfilename+ ".true." + exp_config["source_id"] + "/moses.ini ")\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["source_id"])\ + " > " + os.path.join(easy_config.easy_evaluation, testfilename + ".translated." + exp_config["target_id"])\ + " 2> " + os.path.join(easy_config.easy_evaluation, testfilename + ".out") + " " command2 = easy_config.mosesdecoder_path + "scripts/generic/multi-bleu.perl "\ + " -lc " + os.path.join(easy_config.easy_evaluation, testfilename + ".true." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_evaluation, testfilename + ".translated." + exp_config["target_id"]) # + " < " + easy_config.easy_evaluation + testfilename + ".translated." + exp_config["target_id"] + ".9" write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def train_nplm (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "scripts/training/bilingual-lm/train_nplm.py"\ + " --working-dir " + easy_config.easy_bnplm\ + " --corpus " + os.path.join(easy_config.easy_corpus, training_filename + ".clean ")\ + " --nplm-home " + easy_config.nplm_path\ + " --ngram-size " + exp_config["bnplm_ngram"]\ + " --epochs " + exp_config["bnplm_epochs"]\ + " --learning-rate " + exp_config["bnplm_learning_rate"]\ + " --hidden "+ exp_config["bnplm_hidden"]\ + " --input-embedding "+ exp_config["bnplm_input_embedding"]\ + " --output-embedding " + exp_config["bnplm_hidden"]\ + " --threads " + exp_config["threads"]\ + " &> " + os.path.join(easy_config.easy_bnplm, "nplm.out") + " &" write_step (command1, easy_config) os.system(command1)
def tokenisation(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["source_id"]\ + " -threads " + exp_config["threads"]\ + " -no-escape 1 "\ + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["source_id"] + " > "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["source_id"]) command2 = easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["target_id"]\ + " -threads " + exp_config["threads"]\ + " -no-escape 1 "\ + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["target_id"] + " > "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".tok." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def train_nplm(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "scripts/training/bilingual-lm/train_nplm.py"\ + " --working-dir " + easy_config.easy_bnplm\ + " --corpus " + os.path.join(easy_config.easy_corpus, training_filename + ".clean ")\ + " --nplm-home " + easy_config.nplm_path\ + " --ngram-size " + exp_config["bnplm_ngram"]\ + " --epochs " + exp_config["bnplm_epochs"]\ + " --learning-rate " + exp_config["bnplm_learning_rate"]\ + " --hidden "+ exp_config["bnplm_hidden"]\ + " --input-embedding "+ exp_config["bnplm_input_embedding"]\ + " --output-embedding " + exp_config["bnplm_hidden"]\ + " --threads " + exp_config["threads"]\ + " &> " + os.path.join(easy_config.easy_bnplm, "nplm.out") + " &" write_step(command1, easy_config) os.system(command1)
def test_on_train (easy_config) : filename = "OF.clean." + exp_config["source_id"] t_filter_model_given_input(easy_config, easy_config.easy_overfitting, filename) command1 = "nohup nice " + easy_config.mosesdecoder_path + "bin/moses "\ + " -threads " + exp_config["threads"]\ + " -f " + os.path.join(easy_config.easy_overfitting, "filtered-"+filename+"/moses.ini ")\ + " < " + os.path.join(easy_config.easy_overfitting, filename)\ + " > " + os.path.join(easy_config.easy_overfitting, "OF.translated." + exp_config["target_id"])\ + " 2> " + os.path.join(easy_config.easy_overfitting, "OF.clean.out") + " " command2 = easy_config.mosesdecoder_path + "scripts/generic/multi-bleu.perl "\ + " -lc " + os.path.join(easy_config.easy_overfitting, "OF.clean." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_overfitting, "OF.translated." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1) write_step (command2, easy_config) os.system(command2)
def pkl (easy_config, training_filename): command1 = "python " + easy_config.nmt_path + "preprocess/preprocess.py "\ + os.path.join(easy_config.easy_corpus, training_filename + ".clean." + exp_config["source_id"])\ + " -d " + os.path.join(easy_config.easy_corpus, "vocab." + exp_config["source_id"] + ".pkl")\ + " -v " + exp_config["source_vocb"]\ + " -b " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["source_id"] + ".pkl")\ + " -p " #+ os.path.join(easy_config.easy_corpus, "*"+exp_config["source_id"]+".txt.gz" command2 = "python " + easy_config.nmt_path + "preprocess/preprocess.py " \ + os.path.join(easy_config.easy_corpus, training_filename + ".clean." + exp_config["target_id"])\ + " -d " + os.path.join(easy_config.easy_corpus, "vocab." + exp_config["target_id"] + ".pkl")\ + " -v " + exp_config["target_vocb"]\ + " -b " + os.path.join(easy_config.easy_corpus, "binarized_text." + exp_config["target_id"] + ".pkl")\ + " -p " #+ os.path.join(easy_config.easy_corpus, "*"+exp_config["source_id"]+".txt.gz" write_step (command1, easy_config) os.system (command1) write_step (command2, easy_config) os.system (command2)
def test_on_train(easy_config): filename = "OF.clean." + exp_config["source_id"] t_filter_model_given_input(easy_config, easy_config.easy_overfitting, filename) command1 = "nohup nice " + easy_config.mosesdecoder_path + "bin/moses "\ + " -threads " + exp_config["threads"]\ + " -f " + os.path.join(easy_config.easy_overfitting, "filtered-"+filename+"/moses.ini ")\ + " < " + os.path.join(easy_config.easy_overfitting, filename)\ + " > " + os.path.join(easy_config.easy_overfitting, "OF.translated." + exp_config["target_id"])\ + " 2> " + os.path.join(easy_config.easy_overfitting, "OF.clean.out") + " " command2 = easy_config.mosesdecoder_path + "scripts/generic/multi-bleu.perl "\ + " -lc " + os.path.join(easy_config.easy_overfitting, "OF.clean." + exp_config["target_id"])\ + " < " + os.path.join(easy_config.easy_overfitting, "OF.translated." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def prepare_corpus(easy_config): command1 = (easy_config.mosesdecoder_path + "scripts/tokenizer/tokenizer.perl -l " + exp_config["target_id"] + " -threads " + exp_config["threads"] + " -no-escape 1 " + " < " + exp_config["training_corpus"] + training_filename + "." + exp_config["target_id"] + " > " + easy_config.easy_nplm + training_filename + ".tok." + exp_config["target_id"]) command2 = (easy_config.mosesdecoder_path + "scripts/recaser/truecase.perl --model " + " " + easy_config.easy_truecaser + "truecase-model." + exp_config["target_id"] + " < " + easy_config.easy_nplm + training_filename + ".tok." + exp_config["target_id"] + " > " + easy_config.easy_nplm + training_filename + ".true." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1) write_step(command2, easy_config) os.system(command2)
def cross_corpus(id1, mt_type, tag, easy_config): command1 = "python " + easy_config.nmt_path + "sample.py"\ + " --beam-search "\ + " --beam-size 12"\ + " --state " + easy_config.easy_workspace + "nmt/" + id1 + "/" + "search_state.pkl "\ + " --source " + easy_config.easy_evaluation + testfilename + ".true." + exp_config["source_id"]\ + " --trans " + easy_config.easy_evaluation + testfilename + ".translated." + id1 + "." + exp_config["target_id"]\ + " " + easy_config.easy_workspace + "nmt/" + id1 + "/" + "search_model.npz"\ + " >& " + easy_config.easy_evaluation + id1 + "_out.txt &" command2 = "nohup nice " + easy_config.mosesdecoder_path + "bin/moses "\ + " -threads " + exp_config["threads"]\ + " -f " + easy_config.easy_workspace + "tuning/" + id1 + "/"+ "moses.ini "\ + " < " + easy_config.easy_evaluation + testfilename + ".true." + exp_config["source_id"]\ + " > " + easy_config.easy_evaluation + testfilename + ".translated." + id1 + "." + exp_config["target_id"]\ + " 2> " + easy_config.easy_evaluation + id1 + "_out.txt" command3 = easy_config.mosesdecoder_path + "scripts/generic/multi-bleu.perl "\ + " -lc " + easy_config.easy_evaluation + testfilename + ".true." + exp_config["target_id"]\ + " < " + easy_config.easy_evaluation + testfilename + ".translated." + id1 + "." + exp_config["target_id"] if tag == "tr" and mt_type == "nmt": write_step(command1, easy_config) os.system(command1) elif tag == "tr" and mt_type == "smt": write_step(command2, easy_config) os.system(command2) if Tag == "te": write_step(comMand3) os.system(command3)
def generate_sb (easy_config, training_filename) : command1 = easy_config.irstlm_path + "bin/add-start-end.sh < "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".true." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_lm, training_filename + ".sb." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1)
def generate_blm (easy_config, training_filename) : command1 = easy_config.mosesdecoder_path + "bin/build_binary -i "\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".arpa." + exp_config["target_id"])\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".blm." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1)
def generate_blm(easy_config, training_filename): command1 = easy_config.mosesdecoder_path + "bin/build_binary -i "\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".arpa." + exp_config["target_id"])\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".blm." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1)
def generate_sb(easy_config, training_filename): command1 = easy_config.irstlm_path + "bin/add-start-end.sh < "\ + " " + os.path.join(easy_config.easy_corpus, training_filename + ".true." + exp_config["target_id"])\ + " > " + os.path.join(easy_config.easy_lm, training_filename + ".sb." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1)
def generate_arpa(easy_config, training_filename): command1 = easy_config.irstlm_path + "bin/compile-lm --text=yes "\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".lm." + exp_config["target_id"] + ".gz")\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".arpa." + exp_config["target_id"]) write_step(command1, easy_config) os.system(command1)
def overfitting_prepare(easy_config, training_filename, sampling_base = 30): easycorpus.sampling_file(os.path.join(easy_config.easy_corpus, training_filename+".clean."+exp_config["source_id"]), easy_config.easy_overfitting+"/OF.clean."+exp_config["source_id"], sampling_base) easycorpus.sampling_file(os.path.join(easy_config.easy_corpus, training_filename+".clean."+exp_config["target_id"]), easy_config.easy_overfitting+"/OF.clean."+exp_config["target_id"], sampling_base) write_step("overfitting_prepare", easy_config)
def generate_arpa (easy_config, training_filename) : command1 = easy_config.irstlm_path + "bin/compile-lm --text=yes "\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".lm." + exp_config["target_id"] + ".gz")\ + " " + os.path.join(easy_config.easy_lm, training_filename + ".arpa." + exp_config["target_id"]) write_step (command1, easy_config) os.system(command1)