def filter_words(target, source, env):
    """
    Restrict a language model, pronunciation file and vocabulary file to the
    words listed in a second (limiting) vocabulary file.

    Sources: vocabulary, pronunciations, ARPA language model, limiting vocabulary.
    Targets: filtered vocabulary, filtered pronunciations, filtered language model.
    The language model probabilities are rescaled so that unigrams sum to one.
    """
    with meta_open(source[0].rstr()) as voc_fd, meta_open(source[1].rstr()) as pron_fd, meta_open(source[2].rstr()) as lm_fd, meta_open(source[3].rstr()) as lim_fd:
        # Load all four inputs before doing any filtering.
        voc = Vocabulary(voc_fd)
        pron = Pronunciations(pron_fd)
        lm = Arpabo(lm_fd)
        lim = Vocabulary(lim_fd)
        logging.info("Original vocabulary: %s", voc)
        logging.info("Original pronunciations: %s", pron)
        logging.info("Original LM: %s", lm)
        logging.info("Limiting vocabulary: %s", lim)
        keep = lim.get_words()
        logging.info("Vocabulary to remove has mass: %s", lm.get_probability_of_not_words(keep))
        logging.info("Vocabulary to remain has mass: %s", lm.get_probability_of_words(keep))
        # Filter each resource down to the limiting vocabulary.
        for resource in (lm, pron, voc):
            resource.filter_by(lim)
        logging.info("New vocabulary: %s", voc)
        logging.info("New pronunciations: %s", pron)
        logging.info("New LM: %s", lm)
        with meta_open(target[0].rstr(), "w") as voc_ofd, meta_open(target[1].rstr(), "w") as pron_ofd, meta_open(target[2].rstr(), "w") as lm_ofd:
            voc_ofd.write(voc.format())
            pron_ofd.write(pron.format())
            lm_ofd.write(lm.format())
    return None
def top_words(target, source, env):
    """
    Keep the COUNT most probable words from a probability list and restrict a
    pronunciation file to those words.

    Sources: probability list, pronunciations, argument Value (with "COUNT").
    Targets: top-word probability list, filtered pronunciations.
    """
    params = source[-1].read()
    count = params["COUNT"]
    with meta_open(source[0].rstr()) as words_ifd, meta_open(source[1].rstr()) as pron_ifd:
        best = ProbabilityList(words_ifd).get_top_n(count)
        pronunciations = Pronunciations(pron_ifd)
        pronunciations.filter_by(best)
    with meta_open(target[0].rstr(), "w") as words_ofd, meta_open(target[1].rstr(), "w") as pron_ofd:
        words_ofd.write(best.format())
        pron_ofd.write(pronunciations.format())
    return None
def filter_babel_gum(target, source, env):
    """
    Restrict a pronunciation file and a probability list to the words in a
    limiting vocabulary.

    Sources: pronunciations, probability list, limiting vocabulary.
    Targets: filtered pronunciations, filtered probability list.
    """
    with meta_open(source[0].rstr()) as pron_ifd, meta_open(source[1].rstr()) as prob_ifd, meta_open(source[2].rstr()) as lim_ifd:
        pronunciations = Pronunciations(pron_ifd)
        probabilities = ProbabilityList(prob_ifd)
        allowed = Vocabulary(lim_ifd)
        logging.info("Old pronunciations: %s", pronunciations)
        logging.info("Old probabilities: %s", probabilities)
        logging.info("Correct words: %s", allowed)
        pronunciations.filter_by(allowed)
        logging.info("New pronunciations: %s", pronunciations)
        probabilities.filter_by(allowed)
        logging.info("New probabilities: %s", probabilities)
        with meta_open(target[0].rstr(), "w") as pron_ofd, meta_open(target[1].rstr(), "w") as prob_ofd:
            pron_ofd.write(pronunciations.format())
            prob_ofd.write(probabilities.format())
    return None
def replace_pronunciations(target, source, env):
    """
    Merge two pronunciation files, preferring the second for overlapping words.

    Sources: base pronunciations, replacement pronunciations.
    Targets: vocabulary file, pronunciation file (both derived from the merge).
    """
    with meta_open(source[0].rstr()) as base_fd, meta_open(source[1].rstr()) as repl_fd:
        base = Pronunciations(base_fd)
        replacements = Pronunciations(repl_fd)
    logging.info("Old pronunciations: %s", base)
    logging.info("Replacement pronunciations: %s", replacements)
    # Overlapping words take their pronunciations from the replacement set.
    base.replace_by(replacements)
    logging.info("New pronunciations: %s", base)
    with meta_open(target[0].rstr(), "w") as voc_ofd, meta_open(target[1].rstr(), "w") as pron_ofd:
        voc_ofd.write(base.format_vocabulary())
        pron_ofd.write(base.format())
    return None
def pronunciation_performance(target, source, env):
    """
    Score generated pronunciations against a gold standard.

    Sources: gold pronunciations, generated pronunciations.
    Target: a text file containing "precision recall f-score".

    Only words present in both files are scored.  Each pronunciation is
    lower-cased and compared as a tuple of phones: a generated pronunciation
    matching a gold one counts as a true positive, a gold pronunciation with
    no match as a false negative, and a generated pronunciation with no match
    as a false positive.
    """
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as gen_fd:
        gold = Pronunciations(gold_fd)
        gen = Pronunciations(gen_fd)
        logging.info("gold phone inventory: %s", " ".join(gold.phones()))
        logging.info("generated phone inventory: %s", " ".join(gen.phones()))
        tp, fp, fn = 0, 0, 0
        for word in gen.get_words().intersection(gold.get_words()):
            # Normalize case so phone-set capitalization differences don't
            # count as mismatches.
            gold_prons = set(tuple(map(str.lower, y)) for y in gold[word].values())
            gen_prons = set(tuple(map(str.lower, y)) for y in gen[word].values())
            tp += len(gold_prons & gen_prons)
            fn += len(gold_prons - gen_prons)
            fp += len(gen_prons - gold_prons)
        # Guard every division: with no overlapping words (or no matches at
        # all) the original code raised ZeroDivisionError; report 0.0 instead.
        prec = float(tp) / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0
        f = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("%f %f %f\n" % (prec, rec, f))
    return None
def pronunciations_to_vocabulary(target, source, env):
    """
    Derive a vocabulary file from a pronunciation file.

    Source: pronunciations.  Target: vocabulary.
    """
    with meta_open(source[0].rstr()) as ifd:
        pronunciations = Pronunciations(ifd)
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(pronunciations.format_vocabulary())
    return None
def augment_language_model(target, source, env):
    """
    Add new words (and optionally their probabilities) to a language model.

    Sources: old pronunciations, old ARPA language model, new pronunciations,
        [new word probabilities when weighted], mass (a Value node giving the
        unigram probability mass to assign to the new words).
    Targets: new vocabulary, new pronunciations, new language model.

    When five sources are given, the fourth is a probability list and the new
    unigrams are weighted by it; otherwise the mass is spread uniformly.
    """
    weighted = len(source) == 5
    # Open inputs with context managers: the original passed unclosed
    # meta_open(...) handles straight to the constructors, leaking them.
    with meta_open(source[0].rstr()) as ifd:
        old_prons = Pronunciations(ifd)
    with meta_open(source[1].rstr()) as ifd:
        old_lm = Arpabo(ifd)
    with meta_open(source[2].rstr()) as ifd:
        new_prons = Pronunciations(ifd)
    mass = source[-1].read()
    logging.info("Old LM: %s", old_lm)
    logging.info("Old Pronunciations: %s", old_prons)
    logging.info("Words to add: %s", new_prons)
    if weighted:
        with meta_open(source[3].rstr()) as ifd:
            new_probs = ProbabilityList(ifd)
        logging.info("Words to add (probabilities): %s", new_probs)
    old_prons.add_entries(new_prons)
    if weighted:
        old_lm.add_unigrams_with_probs(new_probs, mass)
    else:
        old_lm.add_unigrams(new_prons.get_words(), mass)
    logging.info("New Pronunciations: %s", old_prons)
    logging.info("New LM: %s", old_lm)
    logging.info("New words have weight %s", old_lm.get_probability_of_words(new_prons.get_words()))
    logging.info("Old words have weight %s", old_lm.get_probability_of_not_words(new_prons.get_words()))
    # Renamed output handles: the original bound the target[1] handle to
    # "new_prons", shadowing the Pronunciations object loaded above.
    with meta_open(target[0].rstr(), "w") as vocab_ofd, meta_open(target[1].rstr(), "w") as pron_ofd, meta_open(target[2].rstr(), "w") as lm_ofd:
        vocab_ofd.write(old_prons.format_vocabulary())
        pron_ofd.write(old_prons.format())
        lm_ofd.write(old_lm.format())
    return None