Esempio n. 1
0
def epi_test():
    """Print Epitran transliterations of a sample word, with and without backoff."""
    sample = 'Werkt'

    # Dutch-only model.
    dutch_only = Epitran('nld-Latn')
    print(dutch_only.transliterate(sample))

    # Backoff chain: English is listed first, Dutch second.
    # Kept as a check that flite works.
    eng_then_dutch = Backoff(['eng-Latn', 'nld-Latn'])
    print(eng_then_dutch.transliterate(sample))
Esempio n. 2
0
def compare_epi_to_dict():
    """Print how many dictionary entries differ from Epitran's output.

    Loads the JSON mapping in
    ``g2p_dictionary/dutch_dic_to_phonetic.1.json``, transliterates every key
    with the ``nld-Latn`` Epitran model, and prints the number of keys whose
    transliteration is not equal to the value stored for that key.
    """
    # Context manager guarantees the handle is closed, even if parsing fails.
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r') as dict_file:
        g2p_dict = json.load(dict_file)

    epi = Epitran('nld-Latn')
    n = sum(
        1
        for key, dic_word in g2p_dict.items()
        if dic_word != epi.transliterate(key)
    )
    print(n)
Esempio n. 3
0
from epitran import Epitran
from util import lang2ISO
import codecs
import argparse

if __name__ == '__main__':
    # Check a tab-separated file of (lemma, word form, tags) rows: report rows
    # whose lemma or word form is all digits, and rows where Epitran returns
    # an empty transliteration for either of them.
    parser = argparse.ArgumentParser("Clean Data")
    parser.add_argument('fn', metavar='fn')
    parser.add_argument('lang', metavar='lang')

    args = parser.parse_args()
    fn = args.fn
    lang = args.lang

    # Read all rows first; the file is not needed once they are in memory.
    with codecs.open(fn, "r", encoding='utf-8') as file:
        lines = [line.strip().split('\t') for line in file]

    iso = lang2ISO(lang)
    epi = Epitran(iso)
    for fields in lines:
        # A blank (or whitespace-only) line splits to [''] -- skip it rather
        # than crash when unpacking three fields.
        if fields == ['']:
            continue
        # strip() also removes a trailing tab, so a row with an empty last
        # column arrives with fewer than three fields; report, don't crash.
        if len(fields) != 3:
            print("Malformed line! %s" % '\t'.join(fields))
            continue

        lemma, wf, tags = fields
        if lemma.isdigit() or wf.isdigit():
            print("Digit! %s, %s" % (lemma, wf))
        elif not (epi.transliterate(lemma) and epi.transliterate(wf)):
            print("Cannot transliterate! %s, %s" % (lemma, wf))
Esempio n. 4
0
    elif lang in ['it', 'es', 'fr', 'uk', 'pl', 'ru']:
        lang2code = {
            'it': 'ita-Latn',
            'es': 'spa-Latn',
            'fr': 'fra-Latn',
            'ru': 'rus-Cyrl',
            'uk': 'ukr-Cyrl',
            'pl': 'pol-Latn'
        }
        transcriber = Epitran(lang2code[lang])
        ipa_col = f'{lang}_ipa'
        form_col = 'desc_form'
        # Italian doesn't have phonemic diphthongs.
        merge_vowels = lang != 'it'
        desc[ipa_col] = desc[form_col].apply(
            lambda s: i2t(transcriber.transliterate(s).replace('ˈ', '').
                          replace('ˌ', '').replace("'", ''),
                          merge_vowels=merge_vowels))
        to_normalize = list()
        if lang == 'ru':
            to_normalize = [('á', 'a'), ('ó', 'o'), ('é', 'e'), ('ú', 'u'),
                            ('ɨ́', 'ɨ'), ('í', 'i'), ('t͡ɕʲ', 't͡ɕ'),
                            ('ʂʲ', 'ʂ')]
        elif lang == 'uk':
            to_normalize = [('ɑ́', 'ɑ'), ('ɔ́', 'ɔ'), ('ɛ́', 'ɛ'), ('í', 'i'),
                            ('ú', 'u'), ('ɪ́', 'ɪ')]
        elif lang == 'pl':
            to_normalize = [('ʐ̇', 'ʐ'), ('t͡ʂ', 'ʈ͡ʂ')]
        elif lang == 'fr':
            to_normalize = [('ù', 'u'), ('â', 'a')]

        def normalize(s):