def filter_words(target, source, env):
    """
    Restrict a vocabulary, pronunciation file and language model to a word list.

    Sources, in order: vocabulary file, pronunciation file, language model
    file, and the limiting vocabulary file.  Targets, in order: the filtered
    vocabulary, pronunciation and language model files.

    Each of the three resources is filtered through its own ``filter_by``
    method against the limiting vocabulary and then serialized with
    ``format()``.  Per the original author's note, the language model
    probabilities are rescaled so that unigrams sum to one; that would happen
    inside ``Arpabo.filter_by``, which is not visible here (TODO confirm).

    ``env`` is not used.  Always returns None.
    """
    with meta_open(source[0].rstr()) as vocab_in, \
            meta_open(source[1].rstr()) as pron_in, \
            meta_open(source[2].rstr()) as lm_in, \
            meta_open(source[3].rstr()) as limit_in:
        language_model = Arpabo(lm_in)
        pronunciations = Pronunciations(pron_in)
        vocabulary = Vocabulary(vocab_in)
        limit = Vocabulary(limit_in)

        # Summarize the inputs before anything is modified.
        logging.info("Original vocabulary: %s", vocabulary)
        logging.info("Original pronunciations: %s", pronunciations)
        logging.info("Original LM: %s", language_model)
        logging.info("Limiting vocabulary: %s", limit)

        # Report how the language model's mass splits across the word limit.
        removed_mass = language_model.get_probability_of_not_words(limit.get_words())
        logging.info("Vocabulary to remove has mass: %s", removed_mass)
        kept_mass = language_model.get_probability_of_words(limit.get_words())
        logging.info("Vocabulary to remain has mass: %s", kept_mass)

        # Filter in the same order as always: LM, pronunciations, vocabulary.
        for resource in (language_model, pronunciations, vocabulary):
            resource.filter_by(limit)

        logging.info("New vocabulary: %s", vocabulary)
        logging.info("New pronunciations: %s", pronunciations)
        logging.info("New LM: %s", language_model)

    with meta_open(target[0].rstr(), "w") as vocab_out, \
            meta_open(target[1].rstr(), "w") as pron_out, \
            meta_open(target[2].rstr(), "w") as lm_out:
        outputs = (
            (vocab_out, vocabulary),
            (pron_out, pronunciations),
            (lm_out, language_model),
        )
        for handle, resource in outputs:
            handle.write(resource.format())
    return None
def top_words(target, source, env):
    """
    Keep the top-N entries of a probability list and their pronunciations.

    Sources, in order: a word probability file and a pronunciation file; the
    last source is read with ``.read()`` and must yield a mapping that has a
    ``"COUNT"`` key, which is passed to ``ProbabilityList.get_top_n``.
    Targets, in order: the reduced word list and the pronunciations filtered
    down to it.

    ``env`` is not used.  Always returns None.
    """
    options = source[-1].read()

    with meta_open(source[0].rstr()) as words_in, \
            meta_open(source[1].rstr()) as pron_in:
        probabilities = ProbabilityList(words_in)
        top = probabilities.get_top_n(options["COUNT"])
        pronunciations = Pronunciations(pron_in)
        # Drop pronunciations for anything outside the selected entries.
        pronunciations.filter_by(top)

    with meta_open(target[0].rstr(), "w") as words_out, \
            meta_open(target[1].rstr(), "w") as pron_out:
        words_out.write(top.format())
        pron_out.write(pronunciations.format())
    return None
def filter_babel_gum(target, source, env):
    """
    Filter a pronunciation file and a probability list by a vocabulary.

    Sources, in order: pronunciation file, probability file, and the
    vocabulary file used as the filter.  Targets, in order: the filtered
    pronunciations and the filtered probabilities.

    Each object is logged as it is loaded and again after it is filtered.
    ``env`` is not used.  Always returns None.
    """
    with meta_open(source[0].rstr()) as pron_in, \
            meta_open(source[1].rstr()) as prob_in, \
            meta_open(source[2].rstr()) as vocab_in:
        pronunciations = Pronunciations(pron_in)
        logging.info("Old pronunciations: %s", pronunciations)

        probabilities = ProbabilityList(prob_in)
        logging.info("Old probabilities: %s", probabilities)

        allowed = Vocabulary(vocab_in)
        logging.info("Correct words: %s", allowed)

        pronunciations.filter_by(allowed)
        logging.info("New pronunciations: %s", pronunciations)

        probabilities.filter_by(allowed)
        logging.info("New probabilities: %s", probabilities)

    with meta_open(target[0].rstr(), "w") as pron_out, \
            meta_open(target[1].rstr(), "w") as prob_out:
        for handle, resource in ((pron_out, pronunciations), (prob_out, probabilities)):
            handle.write(resource.format())
    return None