Example #1
from epitran import Epitran
from epitran.backoff import Backoff


def epi_test():
    # Just look at Dutch phonemes
    epi = Epitran('nld-Latn')
    print(epi.transliterate('Werkt'))

    # Try English phonemes first, falling back to Dutch.
    # This also checks that flite works for English.
    backoff = Backoff(['eng-Latn', 'nld-Latn'])
    print(backoff.transliterate('Werkt'))
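A minimal follow-up sketch of the same Backoff fallback applied to several words; the word list is illustrative, and eng-Latn typically needs epitran's flite lex_lookup backend installed.

from epitran.backoff import Backoff

# Illustrative only: with this ordering, Backoff consults the English G2P
# first and falls back to Dutch for tokens English cannot cover.
backoff = Backoff(['eng-Latn', 'nld-Latn'])
for word in ['work', 'Werkt']:
    print(word, backoff.transliterate(word))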
Example #2
import json

from epitran import Epitran


def compare_epi_to_dict():
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r') as dict_file:
        g2p_dict = json.load(dict_file)
    epi = Epitran('nld-Latn')
    n = 0
    for key in g2p_dict:
        dic_word = g2p_dict[key]
        epi_word = epi.transliterate(key)
        if (dic_word != epi_word):
            n += 1
            # fstring = "Key: %s\nDic: %s\nEpi: %s\n" % (key, dic_word, epi_word)
            # print (fstring)
    print(n)
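Exact string comparison also counts pure stress-marking differences as mismatches; a hedged variant of the same check (same placeholder dictionary path) that strips primary and secondary stress before comparing:

import json

from epitran import Epitran


def compare_epi_to_dict_ignoring_stress():
    # Same placeholder dictionary path as above; purely illustrative.
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r') as dict_file:
        g2p_dict = json.load(dict_file)
    epi = Epitran('nld-Latn')

    def strip_stress(ipa):
        return ipa.replace('ˈ', '').replace('ˌ', '')

    mismatches = sum(
        1 for key, dic_word in g2p_dict.items()
        if strip_stress(dic_word) != strip_stress(epi.transliterate(key))
    )
    print(mismatches, 'of', len(g2p_dict), 'entries differ')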
Example #3
    def __init__(self, code, space_names):
        """Constructs VectorWithIPASpace object

        A VectorWithIPASpace object takes orthographic words, via the
        word_to_segs method, and returns a list of tuples consisting of category
        (letter or punctuation), lettercaase, orthographic form, phonetic form,
        id within an IPA space, and articulatory feature vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        self.space = Space(code, space_names)
Example #4
def get_tgt_code_and_transcriber(
        target: str,
        pron_dict: Optional[dict] = None,
        need_transcriber: bool = True) -> Tuple[str, G2P_func]:
    if target == 'roa-opt':
        tgt_code = 'roa_opt'
    else:
        tgt_code = lookup(target).alpha_3

    if not need_transcriber:
        tgt_g2p = None
    # Use epitran.
    elif pron_dict is None:
        if tgt_code in [
                'ita', 'spa', 'por', 'fra', 'cat', 'ron', 'deu', 'nld', 'swe'
        ]:
            epi_code = f'{tgt_code}-Latn'
        else:
            raise ValueError(f'language {target} not supported.')
        tgt_g2p = Epitran(epi_code).transliterate
    # Use pronunciation dictionary.
    else:
        # Return None if entry not found.
        tgt_g2p = lambda token: pron_dict.get((tgt_code, token), None)

    return tgt_code, tgt_g2p
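A usage sketch for get_tgt_code_and_transcriber; it assumes the imported lookup is pycountry.languages.lookup (the .alpha_3 attribute suggests so), and the dictionary entry is invented for illustration.

# Epitran path: 'it' resolves to 'ita' and is transcribed with Epitran('ita-Latn').
tgt_code, tgt_g2p = get_tgt_code_and_transcriber('it')
print(tgt_code)           # 'ita'
print(tgt_g2p('ciao'))    # IPA string produced by Epitran

# Dictionary path: entries are keyed by (code, token); missing tokens return None.
pron_dict = {('ita', 'ciao'): 'tʃao'}   # invented entry, for illustration only
_, dict_g2p = get_tgt_code_and_transcriber('it', pron_dict=pron_dict)
print(dict_g2p('ciao'))   # 'tʃao'
print(dict_g2p('mondo'))  # None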
Example #5
	def get_data_stream(self, min_count=1, min_length=1):
		with open(self._file, 'r') as text, \
			open(self._dict_file, 'r') as g2p_file:
			g2p_dict = json.load(g2p_file)
			g2p_dict = {k.lower():v for k,v in g2p_dict.items()}
			epi = Epitran(self._epi_code)

			for line in text:
				list_phonetic_words = self.transliterate_line(line, g2p_dict, epi)
				if (self.validate_line(list_phonetic_words, min_count, min_length)):
					yield (list_phonetic_words)
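transliterate_line is not shown in this snippet; the following is only a guess at its shape, combining the lower-cased G2P dictionary with Epitran as a fallback, and is not the project's actual implementation.

def transliterate_line(line, g2p_dict, epi):
    # Assumed behaviour: prefer the dictionary entry for each lower-cased
    # token and fall back to Epitran when the token is missing.
    phonetic_words = []
    for token in line.strip().split():
        token = token.lower()
        phonetic_words.append(g2p_dict.get(token) or epi.transliterate(token))
    return phonetic_words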
Example #6
	def set_corpus(self):
		with open(self._file, 'r') as text, \
			open(self._dict_file, 'r') as g2p_file:
			g2p_dict = json.load(g2p_file)
			g2p_dict = {k.lower():v for k,v in g2p_dict.items()}
			epi = Epitran(self._epi_code)

			for line in text:
				line_phonetic = self.transliterate_line(line, g2p_dict, epi)
				for word in line_phonetic:
					self._count_table[word] += 1
Example #7
import argparse
import sys

from epitran import Epitran


def main():
    parser = argparse.ArgumentParser(prog="words2ipa.py")
    parser.add_argument("language", help="epitran language code (e.g., eng-Latn)")
    parser.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    parser.add_argument("--sep", help="Separator between IPA symbols (default: none)")
    args = parser.parse_args()

    e = Epitran(args.language)

    print("Reading words from stdin...", file=sys.stderr)
    for word in sys.stdin:
        word = word.strip()
        if word:
            if args.print_word:
                print(word, end=" ")

            ipa = e.trans_list(word)
            if args.sep:
                print(args.sep.join(ipa))
            else:
                print("".join(ipa))
Example #8
class VectorsWithIPASpace(object):
    def __init__(self, code, space_names):
        """Constructs VectorWithIPASpace object

        A VectorWithIPASpace object takes orthographic words, via the
        word_to_segs method, and returns a list of tuples consisting of category
        (letter or punctuation), lettercaase, orthographic form, phonetic form,
        id within an IPA space, and articulatory feature vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        self.space = Space(code, space_names)

    def word_to_segs(self, word, normpunc=False):
        """Returns feature vectors, etc. for segments and punctuation in a word

        Args:
            word (unicode): Unicode string representing a word in the
                            orthography specified when the class is
                            instantiated
            normpunc (bool): normalize punctuation

        Returns:
            list: a list of tuples, each representing an IPA segment or a
                  punctuation character. Tuples consist of <category, lettercase,
                  orthographic_form, phonetic_form, id, feature_vector>.

                  Category consists of the standard Unicode classes (e.g. 'L'
                  for letter and 'P' for punctuation). Case is binary: 1 for
                  uppercase and 0 for lowercase.
        """
        segs = self.epi.word_to_tuples(word, normpunc)
        new_segs = []
        for cat, case, orth, phon, id_vec_list in segs:
            if not phon and normpunc:
                if orth in self.epi.puncnorm:
                    orth = self.epi.puncnorm[orth]
            for s, vector in id_vec_list:
                if s in self.space:
                    id_ = int(self.space[s])
                elif orth in self.space:
                    id_ = int(self.space[orth])
                else:
                    id_ = -1
                new_segs.append((cat, case, orth, phon, id_, vector))
        return new_segs
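A usage sketch of word_to_segs based on the docstring above; the language and space codes are placeholders, since which precomputed spaces exist depends on the installed epitran data.

# Tuple layout per the docstring:
# <category, lettercase, orthographic_form, phonetic_form, id, feature_vector>
vwis = VectorsWithIPASpace('nld-Latn', ['nld-Latn'])
for cat, case, orth, phon, id_, vector in vwis.word_to_segs('Werkt'):
    # cat: Unicode class ('L' letter, 'P' punctuation); case: 1 upper, 0 lower
    # id_: position in the IPA space, or -1 if the segment is unknown
    print(cat, case, orth, phon, id_, len(vector))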
Example #9
    def __init__(self, code, space_names):
        """Construct a Space object

        Space objects take strings (corresponding to segments) and return
        integers, placing them in an integer space that can be translated into
        a one-hot vector.

        The resulting object has a dictionary-like interface that supports
        indexing and iteration over "keys".

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        self.dict = self._load_space(space_names)
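Based on the docstring's dictionary-like interface, a small sketch of querying a Space; the codes are placeholders for whatever space data is installed.

# Placeholder codes; Space maps segment strings to integer ids.
space = Space('deu-Latn', ['deu-Latn'])
segments = list(space)                  # iteration over "keys" (segments)
if 'a' in space:
    print('a', '->', int(space['a']))   # id usable as a one-hot index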
Example #10
def ger_to_ipa(text: str) -> str:
    if Language.GER not in _epitran_cache:
        _epitran_cache[Language.GER] = Epitran('deu-Latn')
    result = _epitran_cache[Language.GER].transliterate(text)
    return result
Example #11
def en_to_ipa(text: str) -> str:
    if Language.ENG not in _epitran_cache:
        _epitran_cache[Language.ENG] = Epitran('eng-Latn')
    result = _epitran_cache[Language.ENG].transliterate(text)
    return result
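Examples #10 and #11 share the same lazy-caching pattern; a generalized sketch follows, where _epitran_cache stands in for the module's global (not shown) and the code mapping is an assumption.

from epitran import Epitran

_epitran_cache = {}
# Assumed mapping from language keys to epitran codes.
_EPI_CODES = {'GER': 'deu-Latn', 'ENG': 'eng-Latn'}


def to_ipa(lang: str, text: str) -> str:
    # Build each Epitran object once and reuse it across calls.
    if lang not in _epitran_cache:
        _epitran_cache[lang] = Epitran(_EPI_CODES[lang])
    return _epitran_cache[lang].transliterate(text)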
Example #12
from epitran import Epitran
from util import lang2ISO
import codecs
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clean Data")
    parser.add_argument('fn', metavar='fn')
    parser.add_argument('lang', metavar='lang')

    args = parser.parse_args()
    fn = args.fn
    lang = args.lang

    with codecs.open(fn, "r", encoding='utf-8') as file:
        lines = [l.strip().split('\t') for l in file]

        iso = lang2ISO(lang)
        epi = Epitran(iso)
        for lemma, wf, tags in lines:
            if lemma.isdigit() or wf.isdigit():
                print("Digit! %s, %s" % (lemma, wf))
            elif epi.transliterate(lemma) and epi.transliterate(wf):
                pass
            else:
                print("Cannot transliterate! %s, %s" % (lemma, wf))
Example #13
     to_rectify = [('g', 'ɡ'), ('gʷ', 'ɡʷ'), ('h', 'x'), ('hʷ', 'xʷ'),
                   ('ɛ', 'e'), ('ɣ', 'ɡ'), ('ɔ', 'o')]
     non_transcriber = OldNorseTranscription()
     desc[ipa_col] = desc[form_col].apply(
         lambda s: non_transcriber.transcribe(s).strip('[]')).apply(
             i2t).apply(lambda lst: [replace(x, to_rectify) for x in lst])
 elif lang in ['it', 'es', 'fr', 'uk', 'pl', 'ru']:
     lang2code = {
         'it': 'ita-Latn',
         'es': 'spa-Latn',
         'fr': 'fra-Latn',
         'ru': 'rus-Cyrl',
         'uk': 'ukr-Cyrl',
         'pl': 'pol-Latn'
     }
     transcriber = Epitran(lang2code[lang])
     ipa_col = f'{lang}_ipa'
     form_col = 'desc_form'
     # Italian doesn't have phonemic diphthongs.
     merge_vowels = lang != 'it'
     desc[ipa_col] = desc[form_col].apply(
         lambda s: i2t(transcriber.transliterate(s).replace('ˈ', '').
                       replace('ˌ', '').replace("'", ''),
                       merge_vowels=merge_vowels))
     to_normalize = list()
     if lang == 'ru':
         to_normalize = [('á', 'a'), ('ó', 'o'), ('é', 'e'), ('ú', 'u'),
                         ('ɨ́', 'ɨ'), ('í', 'i'), ('t͡ɕʲ', 't͡ɕ'),
                         ('ʂʲ', 'ʂ')]
     elif lang == 'uk':
         to_normalize = [('ɑ́', 'ɑ'), ('ɔ́', 'ɔ'), ('ɛ́', 'ɛ'), ('í', 'i'),