Exemple #1
0
    def test_find_stress(self):
        """Check stress.find_stress with default, primary-only and secondary-only marking."""
        words = ("reflect respect recline reduce obsessively demonstrate baseball cloud brother cobblestone "
                 "complete conspire estuary").split(" ")
        # flatten every transcription returned for every word, in order
        stressed_forms = [
            stress.find_stress(transcription)
            for transcriptions in transcribe.get_cmu(words)
            for transcription in transcriptions
        ]
        expected_forms = [
            'r ah ˈf l eh k t', 'r ih ˈf l eh k t', 'r ih ˈs p eh k t',
            'r iy ˈs p eh k t', 'r ih ˈk l ay n', 'r ih ˈd uw s',
            'aa b ˈs eh s ih v l iy', 'ˈd eh m ah n ˌs t r ey t',
            'ˈb ey s ˈb ao l', 'k l aw d', 'ˈb r ah dh er',
            'ˈk aa b ah l ˌs t ow n', 'k ah m ˈp l iy t', 'k ah n ˈs p ay er',
            'ˈeh s ch uw ˌeh r iy',
        ]
        self.assertEqual(stressed_forms, expected_forms)

        sample = "d eh1 m ah0 n s t r ey2 t"

        # only the primary stress should be marked
        primary_only = stress.find_stress(sample, type="primary")
        self.assertEqual(primary_only, 'ˈd eh m ah n s t r ey t')

        # only the secondary stress should be marked
        secondary_only = stress.find_stress(sample, type="secondary")
        self.assertEqual(secondary_only, 'd eh m ah n ˌs t r ey t')
def cmu_to_ipa(cmu_list, mark=True, stress_marking='all'):
    """Convert CMU word lists into IPA transcriptions.

    Args:
        cmu_list: iterable of word lists. Each word list holds one or more
            transcription strings made of space-separated CMU symbols.
            Entries starting with ``__IGNORE__`` are not transliterated;
            the prefix is removed and the rest is passed through.
        mark: when true, append ``*`` to every ``__IGNORE__`` entry that is
            not made up solely of digits.
        stress_marking: forwarded to ``stress.find_stress`` as ``type``.
            A falsy value skips that call and instead strips the digits
            from each entry (entries that are all digits are left alone).

    Returns:
        A list with one item per input word list: the sorted,
        de-duplicated IPA strings produced for that word list.
    """
    symbols = {"a": "ə", "ey": "e", "aa": "ɑ", "ae": "æ", "ah": "ə", "ao": "ɔ",
               "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
               "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ",  "ow": "oʊ", "oy": "ɔɪ",
               "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}
    stress_marks = ("ˈ", "ˌ")
    # (pattern, replacement) pairs that move a stress mark past the leading
    # vowel; only applied when the pattern is not at the start of the word
    swap_list = (("ˈər", "əˈr"), ("ˈie", "iˈe"))

    ipa_list = []  # the final list of IPA tokens to be returned
    for word_list in cmu_list:
        ipa_word_list = []  # the IPA forms for this word list
        for word in word_list:
            if stress_marking:
                word = stress.find_stress(word, type=stress_marking)
            elif re.sub(r"\d*", "", word.replace("__IGNORE__", "")) != "":
                # strip digits, but never from a token that is all numbers
                word = re.sub(r"[0-9]", "", word)

            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk
                if mark and re.sub(r"\d*", "", ipa_form) != "":
                    ipa_form += "*"
            else:
                ipa_pieces = []
                for piece in word.split(" "):
                    # startswith() is safe on empty pieces (e.g. from a
                    # doubled space), unlike indexing piece[0]
                    if piece.startswith(stress_marks):
                        # keep the stress character in its own local so the
                        # ``mark`` parameter is never overwritten
                        stress_mark, phoneme = piece[0], piece[1:]
                    else:
                        stress_mark, phoneme = "", piece
                    if phoneme in symbols:
                        ipa_pieces.append(stress_mark + symbols[phoneme])
                    else:
                        # unknown symbol: pass the piece through unchanged
                        ipa_pieces.append(piece)
                ipa_form = "".join(ipa_pieces)

            for old, new in swap_list:
                if not ipa_form.startswith(old):
                    ipa_form = ipa_form.replace(old, new)
            ipa_word_list.append(ipa_form)
        ipa_list.append(sorted(set(ipa_word_list)))
    return ipa_list