def test_find_stress(self):
    """Verify stress marking on a sample of words and on each stress type."""
    test_string = (
        "reflect respect recline reduce obsessively demonstrate baseball cloud brother cobblestone "
        "complete conspire estuary"
    )
    pronunciations = transcribe.get_cmu(test_string.split(" "))

    # Flatten the nested result: every entry of every inner list is
    # stress-marked, in order. The expected list is longer than the word
    # list because some words yield more than one entry.
    result = [
        stress.find_stress(entry)
        for entries in pronunciations
        for entry in entries
    ]
    expected = [
        'r ah ˈf l eh k t',
        'r ih ˈf l eh k t',
        'r ih ˈs p eh k t',
        'r iy ˈs p eh k t',
        'r ih ˈk l ay n',
        'r ih ˈd uw s',
        'aa b ˈs eh s ih v l iy',
        'ˈd eh m ah n ˌs t r ey t',
        'ˈb ey s ˈb ao l',
        'k l aw d',
        'ˈb r ah dh er',
        'ˈk aa b ah l ˌs t ow n',
        'k ah m ˈp l iy t',
        'k ah n ˈs p ay er',
        'ˈeh s ch uw ˌeh r iy',
    ]
    self.assertEqual(result, expected)

    # test the retrieval of only primary stress
    primary_only = stress.find_stress("d eh1 m ah0 n s t r ey2 t", type="primary")
    self.assertEqual(primary_only, 'ˈd eh m ah n s t r ey t')

    # test the retrieval of only secondary stress
    secondary_only = stress.find_stress("d eh1 m ah0 n s t r ey2 t", type="secondary")
    self.assertEqual(secondary_only, 'd eh m ah n ˌs t r ey t')
def cmu_to_ipa(cmu_list, mark=True, stress_marking='all'):
    """Convert CMU word lists into IPA transcriptions.

    Args:
        cmu_list: Iterable of word lists. Each word list holds strings of
            space-separated CMU symbols, or tokens prefixed with
            "__IGNORE__" that are passed through without conversion.
        mark: When truthy, an asterisk is appended to every "__IGNORE__"
            token that is not made up of digits only.
        stress_marking: When truthy, forwarded to ``stress.find_stress`` as
            its ``type`` argument. When falsy, digits are simply stripped
            from each word (unless the word consists of digits only).

    Returns:
        A list with one entry per input word list; each entry is the
        sorted, de-duplicated list of IPA strings for that word list.
    """
    symbols = {"a": "ə", "ey": "e", "aa": "ɑ", "ae": "æ", "ah": "ə", "ao": "ɔ",
               "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
               "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ", "ow": "oʊ", "oy": "ɔɪ",
               "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}
    stress_marks = ("ˈ", "ˌ")
    # Sequences in which the stress mark is moved one symbol to the right,
    # except when the sequence opens the word.
    swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]

    ipa_list = []  # the final list of IPA tokens to be returned
    for word_list in cmu_list:
        ipa_word_list = []  # the word list for each word
        for word in word_list:
            if stress_marking:
                word = stress.find_stress(word, type=stress_marking)
            elif re.sub(r"\d*", "", word.replace("__IGNORE__", "")) != "":
                # do not delete the token's digits if it's all numbers
                word = re.sub(r"[0-9]", "", word)

            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk:
                if mark and re.sub(r"\d*", "", ipa_form) != "":
                    ipa_form += "*"
            else:
                converted = []
                for piece in word.split(" "):
                    if not piece:
                        # consecutive spaces or an empty word yield empty
                        # pieces; indexing them would raise IndexError
                        continue
                    # NOTE: kept in its own local so the `mark` parameter is
                    # not overwritten for the words that follow.
                    if piece[0] in stress_marks:
                        stress_mark, unmarked = piece[0], piece[1:]
                    else:
                        stress_mark, unmarked = "", piece
                    if unmarked in symbols:
                        converted.append(stress_mark + symbols[unmarked])
                    else:
                        # unknown symbol: keep the piece exactly as given
                        converted.append(piece)
                ipa_form = "".join(converted)

            for old, new in swap_list:
                if not ipa_form.startswith(old):
                    ipa_form = ipa_form.replace(old, new)
            ipa_word_list.append(ipa_form)
        ipa_list.append(sorted(set(ipa_word_list)))
    return ipa_list