def epi_test():
    """Print two transliterations of the sample word 'Werkt'.

    The first uses a Dutch-only Epitran model; the second uses a Backoff
    chain that tries English before Dutch.
    """
    sample = 'Werkt'

    # Dutch phonemes only.
    dutch_only = Epitran('nld-Latn')
    print(dutch_only.transliterate(sample))

    # English phonemes first, with Dutch as the fallback.
    # Kept here as a check that flite works.
    english_then_dutch = Backoff(['eng-Latn', 'nld-Latn'])
    print(english_then_dutch.transliterate(sample))
def compare_epi_to_dict():
    """Print how many dictionary entries disagree with Epitran.

    Loads the mapping stored in
    ``g2p_dictionary/dutch_dic_to_phonetic.1.json``, transliterates every
    key with the ``nld-Latn`` Epitran model, and prints the number of keys
    whose stored value is not equal to the Epitran output.
    """
    # The context manager closes the handle even if parsing fails. UTF-8 is
    # requested explicitly so the read does not depend on the platform's
    # default encoding.
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r',
              encoding='utf-8') as dict_file:
        g2p_dict = json.load(dict_file)

    epi = Epitran('nld-Latn')
    n = sum(
        1
        for word, dic_word in g2p_dict.items()
        if dic_word != epi.transliterate(word)
    )
    print(n)
import argparse
import codecs

from epitran import Epitran
from util import lang2ISO


def main():
    """Report rows of a tab-separated file that look unusable.

    Command-line arguments:
        fn:   path of a UTF-8 file with three tab-separated fields per line
              (unpacked as lemma, word form, tags).
        lang: language identifier passed to ``lang2ISO``; its result is
              handed to ``Epitran`` (presumably an Epitran language code --
              confirm against ``util.lang2ISO``).

    For each row, prints "Digit! ..." when the lemma or the word form
    consists only of digits; otherwise prints "Cannot transliterate! ..."
    when Epitran returns a falsy result for either of them.
    """
    parser = argparse.ArgumentParser("Clean Data")
    parser.add_argument('fn', metavar='fn')
    parser.add_argument('lang', metavar='lang')
    args = parser.parse_args()

    with codecs.open(args.fn, "r", encoding='utf-8') as infile:
        # Blank lines are skipped: they would split into a single empty
        # field and break the three-way unpacking below.
        rows = [line.strip().split('\t') for line in infile if line.strip()]

    epi = Epitran(lang2ISO(args.lang))

    for lemma, wf, tags in rows:
        if lemma.isdigit() or wf.isdigit():
            print("Digit! %s, %s" % (lemma, wf))
            continue
        # Both forms must transliterate to something non-empty.
        if not (epi.transliterate(lemma) and epi.transliterate(wf)):
            print("Cannot transliterate! %s, %s" % (lemma, wf))


if __name__ == '__main__':
    main()
elif lang in ['it', 'es', 'fr', 'uk', 'pl', 'ru']: lang2code = { 'it': 'ita-Latn', 'es': 'spa-Latn', 'fr': 'fra-Latn', 'ru': 'rus-Cyrl', 'uk': 'ukr-Cyrl', 'pl': 'pol-Latn' } transcriber = Epitran(lang2code[lang]) ipa_col = f'{lang}_ipa' form_col = 'desc_form' # Italian doesn't have phonemic diphthongs. merge_vowels = lang != 'it' desc[ipa_col] = desc[form_col].apply( lambda s: i2t(transcriber.transliterate(s).replace('ˈ', ''). replace('ˌ', '').replace("'", ''), merge_vowels=merge_vowels)) to_normalize = list() if lang == 'ru': to_normalize = [('á', 'a'), ('ó', 'o'), ('é', 'e'), ('ú', 'u'), ('ɨ́', 'ɨ'), ('í', 'i'), ('t͡ɕʲ', 't͡ɕ'), ('ʂʲ', 'ʂ')] elif lang == 'uk': to_normalize = [('ɑ́', 'ɑ'), ('ɔ́', 'ɔ'), ('ɛ́', 'ɛ'), ('í', 'i'), ('ú', 'u'), ('ɪ́', 'ɪ')] elif lang == 'pl': to_normalize = [('ʐ̇', 'ʐ'), ('t͡ʂ', 'ʈ͡ʂ')] elif lang == 'fr': to_normalize = [('ù', 'u'), ('â', 'a')] def normalize(s):