Example #1
0
def train_alignment(corpus='cmudict',
                    stress="unstressed",
                    subset=False,
                    delete_prob=0.01,
                    insert_prob=0.01,
                    kerberos_cmd=None):
    ### load the corpus and dict of allowables
    ab_pairs = load_pronunciations()
    alignment_scores = load_allowables(delete_prob=delete_prob,
                                       insert_prob=insert_prob)

    ### are we testing with a subset?
    if subset:
        # test with 0.1% of the corpus
        ab_pairs = random.sample(ab_pairs, len(ab_pairs) / 1000)

    ab_pairs.sort(cmp=lambda x, y: cmp(x[0], y[0]))

    # initialize the EM with the corpus and allowables
    em = ViterbiEM(ab_pairs, alignment_scores, max_iterations=100)

    # check to see if we've got a saved model
    em_fname = construct_model_fname(corpus, stress, subset)
    try:
        em.load(em_fname)
    except IOError:
        pass

    if kerberos_cmd is not None:
        os.system(kerberos_cmd)

    # run the Viterbi aligner EM, saving as we go
    while True:
        em.run_EM(1)

        em.save(em_fname)

        if em.converged:
            break

        if em.iteration_number > em.max_iterations:
            break

    return em
Example #2
0
def train_alignment(corpus='cmudict',
                    stress="unstressed",
                    subset=False,
                    delete_prob=0.01,
                    insert_prob=0.01,
                    kerberos_cmd=None):
    ### load the corpus and dict of allowables
    ab_pairs = load_pronunciations()
    alignment_scores = load_allowables(delete_prob=delete_prob,
                                       insert_prob=insert_prob)

    ### are we testing with a subset?
    if subset:
        # test with 0.1% of the corpus
        ab_pairs = random.sample(ab_pairs, len(ab_pairs)/1000)

    ab_pairs.sort(cmp = lambda x,y: cmp(x[0], y[0]))

    # initialize the EM with the corpus and allowables
    em = ViterbiEM(ab_pairs, alignment_scores, max_iterations=100)

    # check to see if we've got a saved model
    em_fname = construct_model_fname(corpus, stress, subset)
    try: em.load(em_fname)
    except IOError: pass

    if kerberos_cmd is not None:
        os.system(kerberos_cmd)

    # run the Viterbi aligner EM, saving as we go
    while True:
        em.run_EM(1)

        em.save(em_fname)

        if em.converged:
            break

        if em.iteration_number > em.max_iterations:
            break

    return em
Example #3
0
def align_all_words(corpus='cmudict', stress="unstressed", subset=False):
    ### load the corpus and dict of allowables
    ab_pairs = load_pronunciations()
    #alignment_scores = load_allowables()

    # load the model
    em = ViterbiEM([],None)
    em_fname = construct_model_fname(corpus, stress, subset)
    em.load(em_fname)

    alignment_scores = em.alignment_scores[-1]

    ### are we testing with a subset?
    if subset:
        # test with 0.1% of the corpus
        ab_pairs = random.sample(ab_pairs, len(ab_pairs)/1000)

    ab_pairs.sort(cmp = lambda x,y: cmp(x[0], y[0]))

    alignments = []
    for a,b in ab_pairs:
        v = ViterbiAligner(a, b, alignment_scores)
        paths = v.get_best_paths()
        if not len(paths):
            print "no path:",a,b
            continue

        # sort by number of insertions/deletions
        path_nones = [(path, count_nones_in_path(path)) for path in paths]
        path_nones.sort(cmp=lambda x,y: cmp(x[1], y[1]))

        min_nones = path_nones[0][1]

        for path, path_none in path_nones:

            if path_none > min_nones:
                continue

            alignments.append(path.get_elements())

    save_alignments(alignments, corpus, stress, subset)
Example #4
0
def align_all_words(corpus='cmudict', stress="unstressed", subset=False):
    ### load the corpus and dict of allowables
    ab_pairs = load_pronunciations()
    #alignment_scores = load_allowables()

    # load the model
    em = ViterbiEM([], None)
    em_fname = construct_model_fname(corpus, stress, subset)
    em.load(em_fname)

    alignment_scores = em.alignment_scores[-1]

    ### are we testing with a subset?
    if subset:
        # test with 0.1% of the corpus
        ab_pairs = random.sample(ab_pairs, len(ab_pairs) / 1000)

    ab_pairs.sort(cmp=lambda x, y: cmp(x[0], y[0]))

    alignments = []
    for a, b in ab_pairs:
        v = ViterbiAligner(a, b, alignment_scores)
        paths = v.get_best_paths()
        if not len(paths):
            print "no path:", a, b
            continue

        # sort by number of insertions/deletions
        path_nones = [(path, count_nones_in_path(path)) for path in paths]
        path_nones.sort(cmp=lambda x, y: cmp(x[1], y[1]))

        min_nones = path_nones[0][1]

        for path, path_none in path_nones:

            if path_none > min_nones:
                continue

            alignments.append(path.get_elements())

    save_alignments(alignments, corpus, stress, subset)