Esempio n. 1
0
def filter_words(target, source, env):
    """
    Restrict a vocabulary, pronunciation dictionary and language model to a word list.

    Sources, in the order they are read:
      0: vocabulary
      1: pronunciations
      2: language model
      3: limiting vocabulary (the words to keep)

    Targets, in the order they are written: vocabulary, pronunciations, language model.

    NOTE(review): the earlier docstring said the language model probabilities are
    rescaled so that unigrams sum to one; that would be done by ``Arpabo.filter_by``,
    which is not visible from this function -- confirm there.
    """
    with meta_open(source[0].rstr()) as vocab_in, \
            meta_open(source[1].rstr()) as pron_in, \
            meta_open(source[2].rstr()) as lm_in, \
            meta_open(source[3].rstr()) as limit_in:
        language_model = Arpabo(lm_in)
        pronunciations = Pronunciations(pron_in)
        vocabulary = Vocabulary(vocab_in)
        limit = Vocabulary(limit_in)

    # Report the state of every input before anything is filtered.
    for message, item in (
            ("Original vocabulary: %s", vocabulary),
            ("Original pronunciations: %s", pronunciations),
            ("Original LM: %s", language_model),
            ("Limiting vocabulary: %s", limit),
    ):
        logging.info(message, item)
    removed_mass = language_model.get_probability_of_not_words(limit.get_words())
    logging.info("Vocabulary to remove has mass: %s", removed_mass)
    kept_mass = language_model.get_probability_of_words(limit.get_words())
    logging.info("Vocabulary to remain has mass: %s", kept_mass)

    # Same filtering order as before: language model, pronunciations, vocabulary.
    for model in (language_model, pronunciations, vocabulary):
        model.filter_by(limit)

    for message, item in (
            ("New vocabulary: %s", vocabulary),
            ("New pronunciations: %s", pronunciations),
            ("New LM: %s", language_model),
    ):
        logging.info(message, item)

    with meta_open(target[0].rstr(), "w") as vocab_out, \
            meta_open(target[1].rstr(), "w") as pron_out, \
            meta_open(target[2].rstr(), "w") as lm_out:
        vocab_out.write(vocabulary.format())
        pron_out.write(pronunciations.format())
        lm_out.write(language_model.format())
    return None
Esempio n. 2
0
def top_words(target, source, env):
    """
    Keep the top-N entries of a probability list and the matching pronunciations.

    Sources: 0 is the word probability list, 1 is the pronunciation file, and the
    last source is a value node whose ``read()`` result is indexed with "COUNT"
    to obtain N.

    Targets: 0 receives the formatted top-N list, 1 receives the pronunciations
    filtered by that list.
    """
    options = source[-1].read()
    with meta_open(source[0].rstr()) as prob_in, \
            meta_open(source[1].rstr()) as pron_in:
        probabilities = ProbabilityList(prob_in)
        best = probabilities.get_top_n(options["COUNT"])
        pronunciations = Pronunciations(pron_in)
        pronunciations.filter_by(best)
    with meta_open(target[0].rstr(), "w") as words_out, \
            meta_open(target[1].rstr(), "w") as pron_out:
        words_out.write(best.format())
        pron_out.write(pronunciations.format())
    return None
Esempio n. 3
0
def filter_babel_gum(target, source, env):
    """
    Filter a pronunciation file and a probability list by a vocabulary.

    Sources: 0 is the pronunciation file, 1 is the probability list, 2 is the
    vocabulary used as the filter.

    Targets: 0 receives the filtered pronunciations, 1 the filtered probabilities.
    The targets are written while the source files are still open.
    """
    with meta_open(source[0].rstr()) as pron_in, \
            meta_open(source[1].rstr()) as prob_in, \
            meta_open(source[2].rstr()) as vocab_in:
        pronunciations = Pronunciations(pron_in)
        logging.info("Old pronunciations: %s", pronunciations)
        probabilities = ProbabilityList(prob_in)
        logging.info("Old probabilities: %s", probabilities)
        allowed = Vocabulary(vocab_in)
        logging.info("Correct words: %s", allowed)

        pronunciations.filter_by(allowed)
        logging.info("New pronunciations: %s", pronunciations)
        probabilities.filter_by(allowed)
        logging.info("New probabilities: %s", probabilities)

        with meta_open(target[0].rstr(), "w") as pron_out, \
                meta_open(target[1].rstr(), "w") as prob_out:
            pron_out.write(pronunciations.format())
            prob_out.write(probabilities.format())
    return None
Esempio n. 4
0
def replace_pronunciations(target, source, env):
    """
    Merge replacement pronunciations into an existing pronunciation file.

    Sources: 0 is the pronunciation file to update, 1 is the file supplying the
    replacements; the merge itself is delegated to ``Pronunciations.replace_by``.

    Targets: 0 receives the vocabulary of the updated pronunciations, 1 receives
    the updated pronunciations.
    """
    with meta_open(source[0].rstr()) as current_in, \
            meta_open(source[1].rstr()) as replacement_in:
        current = Pronunciations(current_in)
        replacement = Pronunciations(replacement_in)

    logging.info("Old pronunciations: %s", current)
    logging.info("Replacement pronunciations: %s", replacement)
    current.replace_by(replacement)
    logging.info("New pronunciations: %s", current)

    with meta_open(target[0].rstr(), "w") as vocab_out, \
            meta_open(target[1].rstr(), "w") as pron_out:
        vocab_out.write(current.format_vocabulary())
        pron_out.write(current.format())
    return None
Esempio n. 5
0
def pronunciation_performance(target, source, env):
    """
    Score generated pronunciations against gold pronunciations.

    Sources: 0 is the gold pronunciation file, 1 is the generated one.
    Target 0 receives a single line "precision recall f-score".

    Only words present in both files are scored.  For each such word the
    pronunciations are compared case-insensitively as tuples of phones:
    a gold pronunciation that was generated is a true positive, a gold
    pronunciation that was not generated is a false negative, and a generated
    pronunciation that is not in the gold set is a false positive.

    Any ratio whose denominator is zero (e.g. the files share no words) is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as gen_fd:
        gold = Pronunciations(gold_fd)
        gen = Pronunciations(gen_fd)
    logging.info("gold phone inventory: %s", " ".join(gold.phones()))
    logging.info("generated phone inventory: %s", " ".join(gen.phones()))

    tp, fp, fn = 0, 0, 0
    for word in gen.get_words().intersection(gold.get_words()):
        # Lower-case every phone so the comparison ignores case differences.
        gold_prons = {tuple(phone.lower() for phone in pron) for pron in gold[word].values()}
        gen_prons = {tuple(phone.lower() for phone in pron) for pron in gen[word].values()}
        tp += len(gold_prons & gen_prons)
        fn += len(gold_prons - gen_prons)
        fp += len(gen_prons - gold_prons)

    # Guard every division: with no overlap all three counts can be zero.
    prec = float(tp) / (tp + fp) if (tp + fp) else 0.0
    rec = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f = 2 * (prec * rec) / (prec + rec) if (prec + rec) else 0.0

    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("%f %f %f\n" % (prec, rec, f))
    return None
Esempio n. 6
0
def pronunciations_to_vocabulary(target, source, env):
    """
    Write the vocabulary of a pronunciation file.

    Source 0 is read as a ``Pronunciations`` object; target 0 receives the
    result of its ``format_vocabulary()``.
    """
    with meta_open(source[0].rstr()) as pron_in:
        pronunciations = Pronunciations(pron_in)
    with meta_open(target[0].rstr(), "w") as vocab_out:
        vocabulary_text = pronunciations.format_vocabulary()
        vocab_out.write(vocabulary_text)
    return None
Esempio n. 7
0
def augment_language_model(target, source, env):
    """
    Add new words to a pronunciation dictionary and a language model.

    Sources, in order:
      0: old pronunciations
      1: old language model
      2: pronunciations of the words to add
      3: probabilities of the words to add (only when five sources are given)
      last: a value node whose ``read()`` result is passed as the ``mass``
            argument to the language model's add_unigrams* method

    Targets, in order: new vocabulary, new pronunciations, new language model.
    """
    # Five sources means a probability list was supplied for the new words.
    weighted = len(source) == 5

    # Every input is parsed inside a ``with`` block so its handle is closed
    # as soon as the object is built instead of being leaked.
    with meta_open(source[0].rstr()) as old_pron_ifd:
        old_prons = Pronunciations(old_pron_ifd)
    with meta_open(source[1].rstr()) as old_lm_ifd:
        old_lm = Arpabo(old_lm_ifd)
    with meta_open(source[2].rstr()) as new_pron_ifd:
        new_prons = Pronunciations(new_pron_ifd)
    mass = source[-1].read()

    logging.info("Old LM: %s", old_lm)
    logging.info("Old Pronunciations: %s", old_prons)
    logging.info("Words to add: %s", new_prons)

    if weighted:
        # NOTE(review): assumes ProbabilityList reads its file fully in the
        # constructor, like the other readers in this file -- confirm.
        with meta_open(source[3].rstr()) as new_prob_ifd:
            new_probs = ProbabilityList(new_prob_ifd)
        logging.info("Words to add (probabilities): %s", new_probs)

    old_prons.add_entries(new_prons)
    if weighted:
        old_lm.add_unigrams_with_probs(new_probs, mass)
    else:
        old_lm.add_unigrams(new_prons.get_words(), mass)

    logging.info("New Pronunciations: %s", old_prons)
    logging.info("New LM: %s", old_lm)
    logging.info("New words have weight %s", old_lm.get_probability_of_words(new_prons.get_words()))
    logging.info("Old words have weight %s", old_lm.get_probability_of_not_words(new_prons.get_words()))

    # Output handles get their own names so they do not shadow ``new_prons``.
    with meta_open(target[0].rstr(), "w") as vocab_ofd, meta_open(target[1].rstr(), "w") as pron_ofd, meta_open(target[2].rstr(), "w") as lm_ofd:
        lm_ofd.write(old_lm.format())
        vocab_ofd.write(old_prons.format_vocabulary())
        pron_ofd.write(old_prons.format())
    return None