Code example #1
File: __init__.py  Project: JesseAbeyta/pronouncingpy
def phones_for_word(find):
    """Get the CMUdict phones for a given word.

    Because a given word might have more than one pronunciation in the
    dictionary, this function returns a list of all possible pronunciations.

    .. doctest::

        >>> import pronouncing
        >>> pronouncing.phones_for_word("permit")
        ['P ER0 M IH1 T', 'P ER1 M IH2 T']

    :param find: a word to find in CMUdict.
    :returns: a list of phone strings that correspond to that word.
    """
    init_cmu()

    found = lookup.get(find.lower(), [])

    # If the requested word isn't in the CMU dictionary, fall back on the
    # g2p_en model to predict a pronunciation.
    if len(found) == 0:
        found = [" ".join(g2p_en.g2p(find))]

    return found
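
A quick usage sketch of both paths, assuming this fork of pronouncingpy plus g2p_en is installed. The second word is made up, so it is not in CMUdict and goes through the g2p_en fallback; its exact phones depend on the model, so no output is shown for it.

# Usage sketch (assumes the fork above with the g2p_en fallback).
import pronouncing

print(pronouncing.phones_for_word("permit"))
# ['P ER0 M IH1 T', 'P ER1 M IH2 T']  -- straight from CMUdict, as in the doctest

print(pronouncing.phones_for_word("blorptastic"))
# Not in CMUdict, so a single-element list predicted by g2p_en is returned.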
Code example #2
def headline_choice(headlines, query):
    # Return the index of the headline whose phoneme string is closest to the
    # query's, using a Levenshtein distance over sliding windows.
    def levenshtein(s1, s2):
        if len(s1) < len(s2):
            return levenshtein(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    # Reduce the query and each headline to a compact phoneme string: g2p
    # returns ARPAbet symbols, and keeping only the first two characters of
    # each symbol drops the stress digits (e.g. 'AH0' -> 'AH').
    with g2p.Session():
        search_query = ''.join(
            [phoneme[:2] for phoneme in g2p.g2p(query) if phoneme != ' '])
        headlines = [
            ''.join(
                [phoneme[:2] for phoneme in g2p.g2p(text) if phoneme != ' '])
            for text in headlines
        ]

    chosen_index = 0
    min_distance = len(query)
    query = search_query
    for index, t in enumerate(headlines):
        edit_distances = []
        if len(query) > len(t):
            t, query = query, t
        if len(query) == len(t):
            t += ' '
        # Slide a query-sized window across the headline in steps of one
        # reduced phoneme (two characters) and keep the smallest distance.
        for i in range(0, len(t) - len(query), 2):
            substring = t[i:i + len(query)]
            edit_distances.append(levenshtein(query, substring))
        if min(edit_distances) < min_distance:
            min_distance = min(edit_distances)
            chosen_index = index
        query = search_query

    return chosen_index
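
A minimal usage sketch for headline_choice with hypothetical headlines and query; it assumes the same older g2p_en module with a Session context manager that the snippet uses as g2p.

# Hypothetical inputs: the function returns the index of the headline whose
# reduced phoneme string is closest to the query's phoneme string.
headlines = [
    "Stocks rally as inflation cools",
    "Local team wins championship",
    "New species of frog discovered",
]
print(headline_choice(headlines, "stock rally inflation"))  # expected: 0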
Code example #3
def getViseme(grapheme):
    # Convert text to phonemes, then map each phoneme to a viseme id.
    phoneme = g2p(grapheme)

    p2v_map = p2vmap()
    viseme = []
    vistring = ""

    for k in phoneme:
        if k in p2v_map:
            viseme.append(p2v_map[k])
            # prev=k
        else:
            # Phonemes missing from the map (punctuation, spaces) become -1.
            viseme.append(-1)
    viseme.append(-1)

    # Build a space-separated string of viseme ids.
    for vis in viseme:
        vistring = vistring + str(vis) + " "
    print(vistring)
    return vistring
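
getViseme relies on two helpers that are not shown here: g2p (text to ARPAbet phonemes) and p2vmap (a phoneme-to-viseme lookup). Purely as an assumption about its shape, p2vmap presumably returns something like the dict below; the real table and viseme ids are project-specific.

# Hypothetical stand-in for p2vmap(): ARPAbet phoneme -> integer viseme id.
def p2vmap():
    return {
        'P': 0, 'B': 0, 'M': 0,        # bilabials share one mouth shape
        'F': 1, 'V': 1,                # labiodentals
        'AA1': 2, 'AE1': 2, 'AH0': 2,  # a few open-vowel examples
        # ... remaining phonemes omitted in this sketch
    }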
Code example #4
def add_phoneme_features(df):
    '''
    Add phonemes as a column.
    Uses g2p_en (https://github.com/Kyubyong/g2p) to get the phonemes.
    Input
    - df: the pandas dataframe that has a Transcript column
    Output
    - df: the pandas dataframe with the added column: phoneme
    '''
    def get_idx_of_numbers_in_sentence(sentence):
        word_list = sentence.split()
        idx_list = []
        for i, word in enumerate(word_list):
            if bool(re.search(r'\d', word)):
                idx_list.append(i)
        return idx_list

    def split_sentence_by_number(sentence):
        idx_of_numbers = get_idx_of_numbers_in_sentence(sentence)
        word_list = sentence.split()
        sentence_list = []
        sub_sentence = ""
        for i in range(len(word_list)):
            if i in idx_of_numbers:
                if len(sub_sentence) > 0:
                    sentence_list.append(sub_sentence.strip())
                    sub_sentence = ""
                sentence_list.append(word_list[i])
            else:
                sub_sentence += ' ' + word_list[i]
        sentence_list.append(sub_sentence.strip())
        return sentence_list

    # The Transcript column repeats each sentence across consecutive rows
    # (one row per word), so phonemes are computed once per new sentence and
    # then handed out one word at a time via `count`.
    past_sentence = ""
    phoneme_list = []
    with g2p.Session():
        for i, sentence in enumerate(df['Transcript']):
            if sentence != past_sentence:
                splitted_sentence = split_sentence_by_number(sentence)
                phonemes_of_sentence = []
                for sub_sentence in splitted_sentence:
                    if bool(re.search(r'\d', sub_sentence)):
                        # A numeric token counts as a single word, so its
                        # whole phoneme list becomes one entry.
                        phonemes_of_sub_sentence = g2p.g2p(sub_sentence)
                        phonemes_of_sentence.append(phonemes_of_sub_sentence)
                    else:
                        # g2p returns a flat list in which a ' ' token marks a
                        # word boundary; joining on ',' and splitting on ', ,'
                        # regroups the phonemes word by word.
                        phonemes_of_sub_sentence = g2p.g2p(sub_sentence)
                        phonemes_of_sub_sentence = ','.join(
                            phonemes_of_sub_sentence)
                        phonemes_of_sub_sentence = phonemes_of_sub_sentence.split(
                            ', ,')
                        phonemes_of_sub_sentence = [
                            phonemes_chunk.split(',')
                            for phonemes_chunk in phonemes_of_sub_sentence
                        ]
                        phonemes_of_sub_sentence = [
                            phonemes for phonemes in phonemes_of_sub_sentence
                            if phonemes != ['.']
                        ]
                        phonemes_of_sentence += phonemes_of_sub_sentence

                past_sentence = sentence
                phonemes_len = len(phonemes_of_sentence)
                count = 0

            assert len(sentence.split()) == phonemes_len
            phoneme_list.append(phonemes_of_sentence[count])
            count = (count + 1) % phonemes_len
    df['phoneme'] = phoneme_list
    return df
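
A usage sketch with hypothetical data, assuming the same older g2p_en API (module-level g2p with a Session context manager) as the snippet. Note the expected input shape: the Transcript column repeats each sentence once per word of that sentence.

# Hypothetical two-word transcript, one row per word.
import pandas as pd

df = pd.DataFrame({'Transcript': ['hello world', 'hello world']})
df = add_phoneme_features(df)
print(df['phoneme'].tolist())
# Expected: one phoneme list per row, e.g. [['HH', 'AH0', 'L', 'OW1'], ['W', 'ER1', 'L', 'D']]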
Code example #5
File: main.py  Project: danyechat/Flask_G2P1
def my_route():
    # page = request.args.get('page', default = 1, type = int)
    word = request.args.get('word', default='*', type=str)
    phonemes = g2p(word)
    return str(phonemes)
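
The excerpt above omits the route decorator and the g2p instance. A minimal sketch of the Flask setup it seems to assume (the route path and app wiring are guesses; g2p_en's G2p class provides the g2p callable):

# Hypothetical surrounding setup for the view above.
from flask import Flask, request
from g2p_en import G2p

app = Flask(__name__)
g2p = G2p()          # G2p instances are callable: g2p("word") -> ARPAbet list

@app.route('/g2p')   # the actual route path is a guess
def my_route():
    word = request.args.get('word', default='*', type=str)
    phonemes = g2p(word)
    return str(phonemes)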
Code example #6
import re

import numpy as np

labels = np.array([[1, 3, 5], [2, 4, 6], [0, 0, 0]])
print(labels[None, ...])
print(labels[None, :])

_whitespace_re = re.compile(r"\s+")
print(re.sub(_whitespace_re, " ", "nihao ma  wo hen   hao"))

text = "nihaoma wo {B IY}henhao a "
print(len(text))
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
print(_curly_re.match(text))

from g2p_en import G2p

g2p = G2p()
print(g2p('B'))
# print("==========="*2)
test = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
for i in test:
    print(g2p(i))

print(g2p('tv'))
print(g2p('ad'))
# print(g2p('fetch'))
# print(g2p('cake'))
# print(g2p('age'))
# print(g2p('banana'))
# print(g2p('ABCDEFG'))
#
# texts = ""
# with open("/home/wangyl/test.txt") as text_file:
Code example #7
def prepare_txt_dict():
    # Turn VCTK transcriptions into phoneme sequences, hand them to the
    # prepare_txt helper (not shown) along with per-utterance paths under
    # processed/, and write the combined result to words_dict.txt.
    speakers = vctk.available_speakers
    td = vctk.TranscriptionDataSource(hp.vctk_path, speakers=speakers)
    transcriptions = td.collect_files()
    wav_paths = vctk.WavFileDataSource(hp.vctk_path,
                                       speakers=speakers).collect_files()

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    futures = list()

    save_name_list = list()

    if not os.path.exists("processed"):
        os.mkdir("processed")

    for ind in range(len(wav_paths)):
        # Derive "<wav basename>.txt" from each wav path.
        savename = os.path.splitext(os.path.basename(wav_paths[ind]))[0] + ".txt"
        savename = os.path.join("processed", savename)
        save_name_list.append(savename)
        # print(savename)
    print("Get Name Done.")

    lists_P = list()

    with g2p.Session():
        for i, text in enumerate(transcriptions):

            # Indices of characters that are neither letters nor spaces;
            # these are deleted below (offset by `cnt` as the string shrinks).
            list_not_alpha = list()
            for ind, ele in enumerate(text):
                if (not ele.isalpha()) and (ele != ' '):
                    list_not_alpha.append(ind)

            # print(list_not_alpha)

            cnt = 0
            for ind in list_not_alpha:
                text = delete_alpha_str(text, ind - cnt)
                cnt = cnt + 1

            # print(text + "######")

            # os.path.basename(wav_paths[ind])[0:len(os.path.basename(wav_paths[ind]))-4]
            # print(os.path.basename(wav_paths[ind])[0:len(os.path.basename(wav_paths[ind]))-4])
            list_P = g2p.g2p(text)
            # print("...")
            # prepare_txt(savename, text)
            # futures.append(executor.submit(partial(prepare_txt, save_name_list[ind], list_P)))
            lists_P.append(list_P)

            if i % 100 == 0:
                print(i)

    print("Get P Done.")

    for ind, list_P in enumerate(lists_P):
        futures.append(
            executor.submit(partial(prepare_txt, save_name_list[ind], list_P)))

    print("Prepare Done.")

    words_dict = dict()

    for future in futures:
        # print(future.result())
        words_dict.update(future.result())

    # print(word_P_dict)
    with open("words_dict.txt", "w") as f:
        for key in words_dict:
            temp_str_P = str()
            for P in words_dict[key]:
                temp_str_P = temp_str_P + P + " "
            str_write = key + "    " + temp_str_P
            f.write(str_write + "\n")
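
Several of the snippets above (#2, #4, #7) call g2p.g2p(...) inside a g2p.Session() context, which matches an older, session-based g2p API. With current g2p_en releases the rough equivalent, as a sketch, is to instantiate G2p once and call it directly; no session object is involved.

# Rough modern equivalent of the `with g2p.Session(): g2p.g2p(text)` pattern.
from g2p_en import G2p

g2p = G2p()
phonemes = g2p("Stocks rally as inflation cools")
print(phonemes)  # flat list of ARPAbet symbols with ' ' tokens between words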