def filter_babel_gum(target, source, env):
    """Filter a pronunciation lexicon and a probability list by a vocabulary.

    Takes the (target, source, env) arguments of what is presumably an
    SCons builder action; ``env`` is not used.

    Sources (in order):
        0: pronunciations file
        1: probability list file
        2: vocabulary file used as the filter

    Targets (in order):
        0: filtered pronunciations
        1: filtered probability list

    Always returns None.
    """
    with meta_open(source[0].rstr()) as lexicon_in, \
            meta_open(source[1].rstr()) as probs_in, \
            meta_open(source[2].rstr()) as vocab_in:
        # Load each input, logging its state before any filtering happens.
        lexicon = Pronunciations(lexicon_in)
        logging.info("Old pronunciations: %s", lexicon)
        probabilities = ProbabilityList(probs_in)
        logging.info("Old probabilities: %s", probabilities)
        allowed = Vocabulary(vocab_in)
        logging.info("Correct words: %s", allowed)

        # Filter the lexicon first, then the probabilities, logging each
        # result immediately after its own filtering step.
        filtered = (
            ("New pronunciations: %s", lexicon),
            ("New probabilities: %s", probabilities),
        )
        for message, table in filtered:
            table.filter_by(allowed)
            logging.info(message, table)

        # Outputs are written while the input handles are still open.
        with meta_open(target[0].rstr(), "w") as lexicon_out, \
                meta_open(target[1].rstr(), "w") as probs_out:
            for handle, table in ((lexicon_out, lexicon),
                                  (probs_out, probabilities)):
                handle.write(table.format())
    return None
def _augment_and_write(target, old_prons, old_lm, new_prons, new_probs, mass):
    """Merge the new words into the lexicon and LM, then write the targets.

    ``new_probs`` is None for the four-source form, in which case the new
    words are added via ``add_unigrams``; otherwise
    ``add_unigrams_with_probs`` is used with the supplied probability list.
    """
    logging.info("Old LM: %s", old_lm)
    logging.info("Old Pronunciations: %s", old_prons)
    logging.info("Words to add: %s", new_prons)

    old_prons.add_entries(new_prons)
    if new_probs is None:
        old_lm.add_unigrams(new_prons.get_words(), mass)
    else:
        old_lm.add_unigrams_with_probs(new_probs, mass)

    logging.info("New Pronunciations: %s", old_prons)
    logging.info("New LM: %s", old_lm)
    # get_words() is called afresh each time rather than cached, since its
    # return type (list vs. one-shot iterator) is not visible from here.
    logging.info("New words have weight %s",
                 old_lm.get_probability_of_words(new_prons.get_words()))
    logging.info("Old words have weight %s",
                 old_lm.get_probability_of_not_words(new_prons.get_words()))

    # Output handles get their own names so they do not shadow new_prons.
    with meta_open(target[0].rstr(), "w") as vocab_ofd, \
            meta_open(target[1].rstr(), "w") as prons_ofd, \
            meta_open(target[2].rstr(), "w") as lm_ofd:
        lm_ofd.write(old_lm.format())
        vocab_ofd.write(old_prons.format_vocabulary())
        prons_ofd.write(old_prons.format())


def augment_language_model(target, source, env):
    """Add new words to a pronunciation lexicon and a language model.

    Takes the (target, source, env) arguments of what is presumably an
    SCons builder action; ``env`` is not used.

    Sources (in order):
        0: old pronunciations
        1: old language model
        2: new pronunciations (the words to add)
        3: probability list for the new words (five-source form only)
        last: node whose ``read()`` yields the ``mass`` value handed to the
              language model's ``add_unigrams*`` call (presumably the
              probability mass reserved for the new words; confirm against
              arpabo)

    Targets (in order):
        0: new vocabulary
        1: new pronunciations
        2: new language model

    Raises:
        ValueError: if ``source`` does not contain exactly 4 or 5 entries.

    Always returns None.
    """
    # Validate before doing any work; previously an unexpected source count
    # surfaced later as a NameError on an unassigned local.
    if len(source) not in (4, 5):
        raise ValueError(
            "augment_language_model expects 4 or 5 sources, got %d"
            % len(source))

    # ProbabilityList is imported by name: a from-import does not bind the
    # module name `arpabo`, so `arpabo.ProbabilityList` was not guaranteed
    # to resolve.
    from arpabo import Arpabo, Pronunciations, ProbabilityList

    # Inputs stay open until all processing and writing is finished, so this
    # does not depend on whether the parsers read their files eagerly.
    with meta_open(source[0].rstr()) as old_prons_ifd, \
            meta_open(source[1].rstr()) as old_lm_ifd, \
            meta_open(source[2].rstr()) as new_prons_ifd:
        old_prons = Pronunciations(old_prons_ifd)
        old_lm = Arpabo(old_lm_ifd)
        new_prons = Pronunciations(new_prons_ifd)

        if len(source) == 4:
            _augment_and_write(target, old_prons, old_lm, new_prons,
                               None, source[3].read())
        else:
            with meta_open(source[3].rstr()) as new_probs_ifd:
                new_probs = ProbabilityList(new_probs_ifd)
                _augment_and_write(target, old_prons, old_lm, new_prons,
                                   new_probs, source[4].read())
    return None