def convert(i):
    """Respell an English word from its IPA transcription.

    The word itself is transcribed when ``ipa.isin_cmu`` accepts it. If the
    word also has an entry in ``mapEnglishToAnglish`` whose value is accepted
    by ``ipa.isin_cmu``, the transcription of that mapped value takes
    precedence.

    Returns:
        ``[(i, respelling)]``, or ``[]`` when no transcription was obtained.
    """
    transcription = ""
    if ipa.isin_cmu(i):
        transcription = ipa.convert(i, stress_marks="primary")
    if i in mapEnglishToAnglish and ipa.isin_cmu(mapEnglishToAnglish[i]):
        transcription = ipa.convert(mapEnglishToAnglish[i],
                                    stress_marks="primary")
    if not transcription:
        return []

    # Applied strictly in this order: "mən" must be handled before the bare
    # "ə", and "j" -> "y" must run before "ʤ"/"ʒ" introduce new "j"s.
    respellings = (
        ("ɑ", "aa"),
        ("æ", "a"),
        ("mən", "mon"),
        ("ə", "e"),
        ("ˈ", ""),
        ("ɛ", "e"),
        ("ɪ", "i"),
        ("ɔ", "o"),
        ("ʊ", "u"),
        ("ʧ", "ch"),
        ("ŋ", "ng"),
        ("ʃ", "sh"),
        ("j", "y"),
        ("ʤ", "j"),
        ("ʒ", "j"),  # z
        ("θ", "þ"),
    )
    for symbol, spelling in respellings:
        transcription = transcription.replace(symbol, spelling)
    return [(i, transcription)]
def convert_english_to_ipa(english_string):
    """Transcribe English text to IPA with ``ipa.convert``.

    Args:
        english_string: A single word or a space-separated phrase.

    Returns:
        For input containing a space, a list with one transcription per
        whitespace-separated token; otherwise the transcription as a single
        string.
    """
    transcription = ipa.convert(english_string)
    if ' ' in english_string:
        # The previous code called ``.split(' ')`` on the list returned by
        # ``.split()``, which raised AttributeError for every phrase.
        return transcription.split()
    return transcription
def pigify(word):
    """Return the Pig-Latin-style IPA form of *word*.

    Lookup order:
      1. ``TRANSFORMS``: the mapped replacement is transcribed instead.
      2. ``SKIPS``: the word is transcribed unchanged.
      3. ``CUSTOM_CONVERSIONS``: the stored value is returned as is.
      4. Otherwise the transcription is split by ``consonant_cluster`` and
         rebuilt as second part + first part + "eɪ".
    """
    if word in TRANSFORMS:
        return ipa.convert(TRANSFORMS[word])
    if word in SKIPS:
        return ipa.convert(word)
    if word in CUSTOM_CONVERSIONS:
        return CUSTOM_CONVERSIONS[word]

    # presumably consonant_cluster returns (leading cluster, remainder);
    # verify against its definition
    pieces = consonant_cluster(ipa.convert(word))
    ending = "eɪ"
    return pieces[1] + pieces[0] + ending
def taoipa(request):
    """Fill in the ``ipa`` field of every ``Quizziz`` record.

    Each record's ``word`` is transcribed with ``ipa.convert``, wrapped in
    slashes, stored on ``quiz.ipa`` and saved. The caller is then redirected
    to '/'.
    """
    for quiz in Quizziz.objects.all():
        quiz.ipa = '/{}/'.format(ipa.convert(quiz.word))
        quiz.save()
    return redirect('/')
def write_wordlist_ipa(overwrite=True):
    """Build a word -> number-sequence table and store it as JSON.

    Up to 100 words are sampled from ``get_word_list()``. Each is run
    through ``convert`` and ``word_to_nums``. A word is kept only when its
    transcription contains no '*' and it yields between 2 and 5 numbers.

    Args:
        overwrite: When true, 'wordlist_major.json' is replaced by the new
            table. When false, the new entries are merged into whatever the
            file already holds (a missing file counts as empty).
    """
    # min/max count of numbers
    min_length = 2
    max_length = 5
    sample_size = 100
    out_fname = 'wordlist_major.json'

    words = get_word_list()
    # Clamp the sample size so a short word list does not raise ValueError.
    words = random.sample(words, min(sample_size, len(words)))

    major_dict = {}
    with tqdm(words) as tqdm_it:
        for word in tqdm_it:
            transcription = convert(word)
            nums = word_to_nums(transcription)
            # don't save this value if too many/few numbers
            if ('*' not in transcription
                    and min_length <= len(nums) <= max_length):
                major_dict[word] = nums

    if overwrite:
        # The old file is irrelevant here, so it is not read and need not
        # exist.
        result = major_dict
    else:
        try:
            with open(out_fname, 'r', encoding='utf-8') as infile:
                result = json.load(infile)
        except FileNotFoundError:
            result = {}
        result.update(major_dict)

    # save the dictionary to a json file
    with open(out_fname, 'w', encoding='utf-8') as outfile:
        json.dump(result, outfile, indent=4, sort_keys=True,
                  ensure_ascii=False)
def draw_word(text, main_color_rgb):
    """Render *text* with its IPA transcription below it on a white image.

    The word is drawn at the top in 'Myriad Pro Bold.ttf' (size 70) and the
    transcription, wrapped in slashes, in 'CALIBRI.TTF' (size 50). Both
    lines are centred horizontally, filled with *main_color_rgb* and given a
    black stroke.

    Returns:
        The RGBA ``Image``.
    """
    gap = 40
    transcription = '/{}/'.format(ipa.convert(text))

    word_font = ImageFont.truetype('Myriad Pro Bold.ttf', 70)
    word_w, word_h = word_font.getsize(text)[0], word_font.getsize(text)[1]
    phonics_font = ImageFont.truetype('CALIBRI.TTF', 50)
    phonics_dims = phonics_font.getsize(transcription)
    phonics_w, phonics_h = phonics_dims[0], phonics_dims[1]

    # Canvas is as wide as the wider line and tall enough for both lines
    # plus the gap.
    canvas = (max(word_w, phonics_w), word_h + phonics_h + gap)
    img = Image.new('RGBA', canvas, (255, 255, 255, 255))
    pen = ImageDraw.Draw(img, 'RGBA')

    word_origin = (int((canvas[0] - word_w) / 2), 0)
    pen.text(word_origin, text=text, fill=main_color_rgb, font=word_font,
             stroke_width=2, stroke_fill='black')

    phonics_origin = (int((canvas[0] - phonics_w) / 2),
                      int((canvas[1] - word_h) / 2) + gap)
    pen.text(phonics_origin, text=transcription, fill=main_color_rgb,
             font=phonics_font, stroke_width=1, stroke_fill='black')
    return img
def ssmlify(sentence):
    """Wrap the spoonerised form of *sentence* in SSML markup.

    Both the sentence and its IPA transcription are passed through
    ``spoonerify`` and processed token by token:

    * tokens whose original word is in ``SWEARS`` use the expletive template
      and get no trailing break or newline of their own;
    * the first and last tokens are emitted as ``<phoneme>`` elements (a
      transcription ending in '*' is split with ``consonant_cluster``);
    * all other tokens are emitted as plain text;
    * every token except the last is followed by an x-weak break.

    Returns:
        The SSML document as a string.
    """
    # NOTE(review): whitespace inside this template is reproduced from the
    # visible source; confirm against the original layout.
    swear_template = """<phoneme alphabet="ipa" ph="%s"/> <say-as interpret-as="expletive">%s</say-as> <phoneme alphabet="ipa" ph="%s"/>\n"""
    phoneme_template = '<phoneme alphabet="ipa" ph="%s">%s</phoneme>'
    fallback_template = '<phoneme alphabet="ipa" ph="%s"/>%s'

    plain_tokens = spoonerify(sentence).split()
    ipa_tokens = spoonerify(ipa.convert(sentence)).split()
    final_index = len(plain_tokens) - 1

    pieces = ['<speak><prosody rate="slow">\n']
    for position, token in enumerate(ipa_tokens):
        original = plain_tokens[position]

        if original.lower() in SWEARS:
            pieces.append(swear_template % (consonant_cluster(token)[0],
                                            original[:-2], token[-1]))
            continue

        at_edge = position >= final_index or position <= 0
        if not at_edge:
            pieces.append(original)
        elif token[-1] != '*':
            pieces.append(phoneme_template % (token, original))
        else:
            pieces.append(fallback_template % consonant_cluster(token[:-1]))

        if position < final_index:
            pieces.append('<break strength="x-weak"/>')
        pieces.append('\n')

    pieces.append('</prosody></speak>\n')
    return ''.join(pieces)
def list_phonetic_rhymes(word):
    """Return the transcriptions of every rhyme of *word*.

    Rhymes come from ``pronouncing.rhymes`` and are transcribed with
    ``p.convert``. Transcriptions containing '*' are left out.
    """
    transcriptions = (p.convert(rhyme) for rhyme in pronouncing.rhymes(word))
    return [item for item in transcriptions if "*" not in item]
def upload():
    """Transcribe a fixed audio file and render the recording page.

    When the hard-coded WAV file exists, it is passed to ``get_post`` and
    the result is converted with ``ipa.convert``. Otherwise "No file" is
    printed and both values stay empty. The template receives them as
    ``posts`` and ``_ipa``.
    """
    audio_dir = 'C:/Users/raymondzhao/myproject/dev.speech/speech/audio/'
    audio_path = audio_dir + 'english81.wav'

    recognised = ""
    transcription = ""
    if not os.path.isfile(audio_path):
        print("No file")
    else:
        recognised = get_post(audio_path)
        transcription = ipa.convert(recognised)

    return render_template('ispeech/record.html', posts=recognised,
                           _ipa=transcription)
def get_caption():
    """Return the IPA transcription of the current caption's first line.

    Reads ``<text_output_dir>/caption-<current_result>.txt`` when the
    module-level ``current_result`` is positive.

    Returns:
        The transcription produced by ``ipa.convert``, or an empty string
        when ``current_result`` is not positive.
    """
    global current_result
    if current_result > 0:
        caption_path = (text_output_dir + "/caption-" + str(current_result)
                        + ".txt")
        # Context manager so the handle is closed; previously it leaked.
        with open(caption_path) as caption_file:
            first_line = caption_file.readline()
        return ipa.convert(first_line)
    return ""
def phonetic_difficulty(word):
    """Score how hard *word* is to pronounce from its IPA transcription.

    One point is added:

    * per character that is a dorsal, per character that is a
      fricative/affricate/liquid, and per 'r' (a character can score in
      more than one class);
    * when the transcription does not end in a vowel;
    * when ``ipa.syllable_count`` reports three or more syllables;
    * when ``check_clusters`` is true;
    * when ``homorganic`` is false.

    Returns:
        The integer score. An empty transcription scores 0; the previous
        code raised IndexError on it.
    """
    word = clean(word)
    word_ipa = clean(ipa.convert(word))
    if not word_ipa:
        return 0

    vowels = 'iɪeɛæɑouʊʌə'
    dorsals = 'kŋgxw'
    fri_aff_liq = 'rfvszxhθðʃʒlʧʤ'
    rhotic = 'r'

    phon = 0
    for symbol in word_ipa:
        # Each membership test contributes 0 or 1.
        phon += ((symbol in dorsals) + (symbol in fri_aff_liq)
                 + (symbol in rhotic))

    if word_ipa[-1] not in vowels:
        phon += 1
    if ipa.syllable_count(word) >= 3:
        phon += 1
    if check_clusters(word_ipa):
        phon += 1
    if not homorganic(word_ipa):
        phon += 1
    return phon
def rec():
    """Listen on the microphone forever and log newly heard phrases.

    Each iteration records one utterance from microphone device 0, sends it
    to Google speech recognition and converts the text with ``ipa.convert``.
    If the transcription differs from the module-level ``word``, it becomes
    the new ``word`` and a row is inserted into the ``words`` table of
    'words.db'. Otherwise, and when recognition fails with
    ``sr.UnknownValueError``, ``word`` is set to '?'.

    This function does not return.
    """
    global word
    while True:
        try:
            recognizer = sr.Recognizer()
            recognizer.energy_threshold = 3000
            mic = sr.Microphone(device_index=0)
            with mic as source:
                print("listening...")
                recognizer.adjust_for_ambient_noise(source)
                audio = recognizer.listen(source)
            speech = recognizer.recognize_google(audio)
            text = ipa.convert(speech)
            if text != word:
                word = text
                # The connection is opened only when a row is written and
                # is always closed. Previously one was opened per iteration
                # and leaked unless this branch ran to completion.
                conn = sqlite3.connect('words.db', check_same_thread=False)
                try:
                    # NOTE(review): the IPA text goes into the `word` column
                    # and the recognised English into `ipa`. This looks
                    # swapped; confirm against the readers of this table
                    # before changing it.
                    conn.execute(
                        "INSERT INTO words (word, ipa) VALUES(?,?)",
                        (text, speech))
                    conn.commit()
                finally:
                    conn.close()
            else:
                word = '?'
        except sr.UnknownValueError:
            word = '?'
def convert_to_phonetic(lines):
    """Transcribe each line to IPA with punctuation stripped.

    Returns:
        A new list holding ``eng_to_ipa.convert(line, keep_punct=False)``
        for every line, in order.
    """
    return [eng_to_ipa.convert(text, keep_punct=False) for text in lines]
def convert_to_ipa(texts):
    """Replace element 1 of every pair in *texts* with its IPA form, in place.

    The text is first converted with ``ipa.convert``. Every token in the
    result that ends in '*' is then replaced by the Epitran ('eng-Latn')
    transliteration of the token without its asterisk.
    """
    transliterator = epitran.Epitran('eng-Latn')
    starred_token = r"[^ ]{0,}\*"
    for pair in texts:
        pair[1] = ipa.convert(pair[1])
        for token in re.findall(starred_token, pair[1]):
            fallback = transliterator.transliterate(token[:-1])
            pair[1] = pair[1].replace(token, fallback)
def str_to_phonetics(texts, *params):
    """Return the IPA transcription of every item in *texts*.

    Each item is coerced with ``str`` before being passed to
    ``ipa.convert``. Extra positional arguments are accepted and ignored.
    """
    return [ipa.convert(str(item)) for item in texts]
def transcribe(self, word, special=False, phonetic=False, derivative=False, debug=False) -> str:
    """Transcribe an English word into Hangul via its IPA pronunciation.

    Args:
        word: The token to transcribe. Only ASCII letters and apostrophes
            are used for the pronunciation lookup.
        special: When true, a non-letter character at the very start or very
            end of *word* is kept and re-attached to the plain output.
        phonetic: When true (and *derivative* is false), return
            ``[phonetic]hangul``.
        derivative: When true, return ``[phonetic]<syllabified>hangul``.
        debug: When true, print the intermediate phonetic state.

    Returns:
        The formatted transcription, or *word* unchanged when the
        pronunciation from ``eng_to_ipa.convert`` contains '*'.

    Side effects:
        Updates ``self.phonetic`` and ``self.transcribed``, and calls
        ``self.syllabify()`` and ``self.phonetic_change()``.
    """
    pre, post = '', ''
    letters = []
    last_index = len(word) - 1
    for index, char in enumerate(word):
        if 'A' <= char <= 'Z' or 'a' <= char <= 'z' or char == "'":
            letters.append(char)
        elif special:
            # Compare positions, not character values: with identical
            # opening and closing marks (e.g. '"word"') the old value
            # comparison never set `post`, dropping the closing mark.
            if index == 0:
                pre = char
            elif index == last_index:
                post = char
    pure_word = ''.join(letters)

    # Look up the pronunciation.
    self.phonetic = eng_to_ipa.convert(pure_word)
    if '*' in self.phonetic:
        return word
    correction = self.errata.get(pure_word)
    if correction:
        self.phonetic = correction

    # Split into syllables.
    self.syllabify()

    # for bug tracking
    if debug:
        print(self.phonetic, self.phonetic_syllabic, self.CV)

    # Substitute each phonetic symbol with Hangul; '/' separators are kept
    # and symbols with no mapping are dropped.
    mapped = []
    for symbol in self.phonetic_syllabic:
        if symbol == '/':
            mapped.append(symbol)
            continue
        jamo = self.consonants.get(symbol) or self.vowels.get(symbol)
        if jamo:
            mapped.append(jamo)
    self.transcribed = ''.join(mapped)

    # Apply phonological changes.
    self.phonetic_change()

    # Compose each syllable (initial/medial/final) into Hangul blocks.
    hangul = ''.join(self.compose(syllable)
                     for syllable in self.transcribed.split('/'))

    if derivative:
        return '[{}]<{}>{}'.format(self.phonetic, self.phonetic_syllabic,
                                   hangul)
    if phonetic:
        return '[{}]{}'.format(self.phonetic, hangul)
    return pre + hangul + post
def check_clusters(word):
    """Report whether the transcription of *word* has a consonant cluster.

    *word* is passed through ``ipa.convert``. A cluster is a run of two or
    more characters that are word characters but neither digits nor one of
    the listed vowels.

    Returns:
        True when such a run exists, otherwise False.
    """
    transcription = ipa.convert(word)
    # Raw string: the pattern is unchanged, but "\d" and "\W" in a normal
    # literal are invalid escape sequences that newer Pythons warn about.
    pattern = r'[^iɪeɛæɑouʊʌə\d\W]{2,}'
    return re.search(pattern, transcription) is not None
def convert_to_ipa(texts):
    """Convert the text of every training pair to IPA, in place.

    Element 1 of each pair is cleaned with ``english_cleaners`` and
    converted with ``ipa.convert``. Tokens that come back ending in '*' are
    replaced by their Epitran ('eng-Latn', ligatures enabled)
    transliteration, computed from the token without the asterisk.
    """
    print("Converting training files to IPA notation...")
    fallback = epitran.Epitran('eng-Latn', ligatures=True)
    for entry in texts:
        entry[1] = ipa.convert(english_cleaners(entry[1]))
        unresolved = re.findall(r"[^ ]{0,}\*", entry[1])
        for starred in unresolved:
            bare = starred[:-1]
            entry[1] = entry[1].replace(starred, fallback.transliterate(bare))
def columnize(df,golden,response):
    """Add comparison columns for a reference column and a response column.

    The response column is normalised in place ('?' and 'x' removed,
    lower-cased). The following columns are then added: ``match.correct``,
    ``token.original``/``token.response``, ``stem.original``/
    ``stem.response`` and ``trans.original``/``trans.response``.

    Args:
        df: DataFrame to extend (modified in place and returned).
        golden: Name of the reference column.
        response: Name of the response column.

    Returns:
        The same DataFrame.
    """
    df[response] = [
        answer.replace('?', '').replace('x', '').lower()
        for answer in df[response]
    ]

    # Exact string match between response and reference.
    df["match.correct"] = np.where(df[response] == df[golden], 1, 0)

    # Word tokens.
    df['token.original'] = list(map(nltk.word_tokenize, df[golden]))
    df['token.response'] = list(map(nltk.word_tokenize, df[response]))

    # Stems (Snowball stemmer applied to the whole cell).
    df['stem.original'] = list(map(snowball_stemmer.stem, df[golden]))
    df['stem.response'] = list(map(snowball_stemmer.stem, df[response]))

    # Respelling applied after tokenising/stemming, before transcription.
    df[response] = [answer.replace('emare', "i'mer")
                    for answer in df[response]]

    # IPA transcriptions; asterisks are stripped from the response only.
    df['trans.original'] = df[golden].map(ipa.convert)
    df['trans.response'] = df[response].map(ipa.convert)
    df["trans.response"] = [transcription.replace('*', '')
                            for transcription in df["trans.response"]]
    return df
def make_line(tag_number, ru, en):
    """Build one output row for an English word and its Russian translation.

    Usage samples come from ``get_samples``. If that raises
    ``requests.exceptions.HTTPError``, a message is printed and the samples
    are left empty.

    Returns:
        ``['0%', tag label, en, IPA of en, ru, '', samples]``.
    """
    try:
        examples = get_samples(en)
    except requests.exceptions.HTTPError:
        print('Проблема с полючением примеров.')
        examples = ''

    tag_label = f'Слова {tag_number}'
    transcription = ipa.convert(en)
    return ['0%', tag_label, en, transcription, ru, '', examples]
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):
    """Synthesize one WAV file per non-blank line of *text_file*.

    Args:
        tacotron2_path: Checkpoint whose 'state_dict' entry is loaded into
            the model returned by ``load_model``.
        waveglow_path: Checkpoint whose 'model' entry is the vocoder.
        text_file: Text file with one utterance per line.
        output_directory: Destination folder, created if missing.

    Each output file is named after the first four space-separated words of
    its line, lower-cased and joined with underscores. Depending on
    ``hparams.preprocessing`` the text is converted to IPA, to ARPAbet, or
    passed through 'english_cleaners' before sequencing.
    """
    # Make synthesis paths
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print("Creating directory " + output_directory + "...")

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    print("Loading models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    # Blank lines are skipped: they would otherwise be synthesized as empty
    # utterances and all written to ".wav".
    with open(text_file) as file:
        genlist = [line.strip() for line in file if line.strip()]

    # The transliterator does not depend on the entry, so build it once (and
    # only when IPA preprocessing is active) rather than once per line.
    use_ipa = hparams.preprocessing == "ipa"
    epi = epitran.Epitran('eng-Latn', ligatures=True) if use_ipa else None

    for entry in genlist:
        wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"

        if use_ipa:
            entry = ipa.convert(english_cleaners(entry))
            # Tokens ending in '*' fall back to Epitran transliteration.
            foreign_words = re.findall(r"[^ ]{0,}\*", entry)
            for word in foreign_words:
                entry = entry.replace(word, epi.transliterate(word[:-1]))
        if hparams.preprocessing == "arpabet":
            entry = make_arpabet(entry)

        # Text sequencer
        if hparams.preprocessing is not None:
            sequence = np.array(text_to_sequence(entry, None))[None, :]
        else:
            sequence = np.array(
                text_to_sequence(entry, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        # Synthesis
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence)
        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio_denoised = denoiser(audio, strength=0.01)[:, 0]

        # Save audio
        print("Saving " + wav_name)
        write(os.path.join(output_directory, wav_name),
              hparams.sampling_rate,
              audio_denoised[0].data.cpu().numpy())
def word_to_number(word: str) -> str:
    """Encode *word* as digits using the ``major`` symbol table.

    The word is transcribed with ``ipa.convert``. Every run of a repeated
    symbol that is a key of ``major`` contributes ``major[symbol]`` once;
    all other characters are ignored.

    Returns:
        The concatenated values, with '*' appended when the transcription
        itself ends in '*'.
    """
    # Escape each key so characters such as ']', '^', '-' or '\\' cannot
    # change the meaning of the character class.
    symbol_class = ''.join(re.escape(symbol) for symbol in major)
    pattern = re.compile(rf"([{symbol_class}])\1*")

    converted = ipa.convert(word)
    groups = pattern.findall(converted)
    # Lazy %-formatting: the message is only built when DEBUG is enabled.
    logging.debug('%s -> %s -> %s', word, converted, groups)

    n = ''.join(str(major[symbol]) for symbol in groups)
    if converted.endswith('*'):
        n += '*'
    return n
def get_word_IPA(file):
    """Pickle a word -> IPA dictionary for the words listed in *file*.

    Args:
        file: Path to a text file with one word per line.

    Only words accepted by ``ipa.isin_cmu`` are included. The dictionary is
    written to 'word_ipa_dict.pkl' in the current directory.
    """
    # Read and convert before opening the output, so a failure here no
    # longer leaves a truncated pickle behind. Both files are now closed
    # deterministically.
    with open(file) as word_file:
        word_list = [line.strip() for line in word_file]

    word_ipa_dict = {
        word: ipa.convert(word)
        for word in word_list
        if ipa.isin_cmu(word)
    }

    with open("word_ipa_dict.pkl", "wb") as pickle_file:
        pickle.dump(word_ipa_dict, pickle_file)
def _getrow_translation(word):
    """Return the first translation a provider yields, else 'not available'."""
    providers = (
        lambda: google_translator().translate(
            word, lang_tgt=TARGET_LANGUAGE),
        lambda: translators.deepl(
            word, from_language='en', to_language=TARGET_LANGUAGE),
        lambda: translators.bing(
            word, from_language='en', to_language=TARGET_LANGUAGE),
        lambda: translators.baidu(
            word, from_language='en', to_language=TARGET_LANGUAGE),
    )
    for provider in providers:
        try:
            return provider()
        except Exception:
            # Best effort: fall through to the next provider.
            continue
    return 'not available'


def _getrow_sentence(word):
    """Return an example sentence scraped for *word*, else 'not available'."""
    try:
        r = requests.post(
            "https://www.wordhippo.com/what-is/process-form.html",
            data={
                'word': word,
                'action': 'Sentences'
            })
        soup = BS(r.text, features="html.parser")
        return soup.findAll(
            'tr', {'id': 'gexv2row1'})[0].findAll('td')[0].text
    except Exception:
        pass
    try:
        r = urllib.request.urlopen(
            'https://sentence.yourdictionary.com/' + word).read().decode(
                "utf8")
        soup = BS(r, features="html.parser")
        return soup.findAll('div', {'class': 'sentence-item'})[0].text
    except Exception:
        return 'not available'


def getRow(word):
    """Collect transcription, translation and an example sentence for *word*.

    The translation target is the module-level ``TARGET_LANGUAGE``.
    Translation providers and sentence sources are tried in order, and each
    lookup falls back to the string 'not available'.

    Returns:
        A dict with keys 'word', 'transcrypt', 'translation' and 'sentence',
        or None when building the row fails (for example when
        ``ipa.convert`` raises).

    Unlike the previous bare ``except:`` clauses, only ``Exception`` is
    caught, so KeyboardInterrupt and SystemExit propagate and a long run can
    be interrupted.
    """
    try:
        return {
            'word': word,
            'transcrypt': ipa.convert(word),
            'translation': _getrow_translation(word),
            'sentence': _getrow_sentence(word)
        }
    except Exception:
        return None
def populate_ipa_dict_from_text(text):
    """Add IPA entries for every word of *text* not yet in the saved dict.

    The dictionary comes from ``load_ipa_dict``. Each new word from
    ``preprocess(text)`` is looked up with ``eng_to_ipa.convert`` (all
    pronunciations, no punctuation, no stress marks) and the result is
    written back with ``save_ipa_dict``.
    """
    known = load_ipa_dict()
    tokens = preprocess(text).split()
    unseen = set(tokens) - set(known.keys())
    for token in unseen:
        known[token] = eng_to_ipa.convert(
            token,
            retrieve_all=True,
            keep_punct=False,
            stress_marks=False,
        )
    save_ipa_dict(known)
def name2ipa(self, name_dict):
    """Build a DataFrame of names with an added IPA transcription column.

    *name_dict* is loaded with ``orient="index"`` into columns 'name' and
    'sex'. Each name is transcribed with ``ipa.convert`` into a new 'ipa'
    column. The elapsed time in hours is printed, as this can take a while.

    Returns:
        The resulting DataFrame.
    """
    started = time.time()
    frame = pd.DataFrame.from_dict(name_dict, orient="index",
                                   columns=["name", "sex"])
    frame["ipa"] = frame["name"].apply(ipa.convert)
    elapsed_hours = (time.time() - started) / 3600.
    print("{} hours".format(elapsed_hours))
    return frame
def homorganic(word):
    """Report whether the transcription has two adjacent listed consonants.

    *word* is passed through ``ipa.convert`` and the result is printed. The
    function then checks for any pair of neighbouring characters that both
    belong to the dorsal/labial/coronal set below.

    Returns:
        True when such a pair exists, otherwise False. This includes
        transcriptions shorter than two characters, for which the previous
        code raised UnboundLocalError.
    """
    dor_lab_cor = 'kŋgxwpbfvmwtdsznlʧʤrj'
    word = ipa.convert(word)
    print(word)
    return any(
        first in dor_lab_cor and second in dor_lab_cor
        for first, second in zip(word, word[1:])
    )
def phonetics(textfile):
    """Write a line-by-line transcription of *textfile* next to it.

    Args:
        textfile: Path to a UTF-8 text file. The output name is built by
            dropping the last four characters (so a three-letter extension
            such as '.txt' is assumed) and appending '_phonetics.txt'.

    Each input line is converted with ``p.convert`` and written followed by
    a newline. The input lines are printed at the end.
    """
    # Read first, so a missing input no longer leaves an empty output file
    # behind.
    with open(textfile, 'r', encoding="utf-8") as source:
        lines = source.readlines()

    # Context manager so the output is closed even if conversion fails.
    output_path = textfile[:-4] + "_phonetics.txt"
    with open(output_path, "w", encoding="utf-8") as file_phonetics:
        for line in lines:
            file_phonetics.write(p.convert(line) + "\n")

    print(lines)
def new(input):
    """Return the candidate respelling closest to *input*.

    The lower-cased word is transcribed with ``ipa.convert`` and the
    transcription is simplified by an ordered list of substitutions.
    Candidates are generated with ``poss`` and expanded with ``filter``.
    The one with the smallest ``rank.dLev`` distance to the word (with
    interior 'y' replaced by 'i') is returned.

    Returns:
        The best candidate, or '' when none scores below 1000. *input* is
        returned unchanged when it is empty, and lower-cased when no
        transcription is available (empty, or ending in '*').
    """
    if len(input) == 0:
        return input

    # convert word to IPA
    input = input.lower()
    trans = ipa.convert(input)
    # `endswith` also covers an empty transcription, which previously raised
    # IndexError.
    if not trans or trans.endswith('*'):
        print(trans)
        print("error, no transcription")
        return input

    # Order matters: e.g. "ŋ" -> "ng" must precede "ngg"/"ngk".
    substitutions = (
        ("ˈ", ""),
        ("ˌ", ""),
        ("iɛ", "ie"),
        ("iɪ", "i"),
        ("æŋ", "en"),
        ("ŋ", "ng"),
        ("ð", "th"),
        ("θ", "th"),
        ("ʃ", "sh"),
        ("ngg", "ng"),
        ("ngk", "nk"),
        ("oʊ", "1"),
        ("aɪ", "2"),
        ("aʊ", "3"),
    )
    for symbol, spelling in substitutions:
        trans = trans.replace(symbol, spelling)

    # step 1 generate possible spellings
    ar1 = []
    poss(input, ar1, trans, '', 0)

    # step 2 add possibilities associated with g(ei) and c(ei)
    ar2 = []
    for el in ar1:
        filter(input, ar2, el, '', 0)

    # Rank the possible spellings by edit distance from the input word,
    # using a modified Damerau-Levenshtein algorithm.
    if len(input) > 1:
        target = input[0] + input[1:-1].replace('y', 'i') + input[-1]
    else:
        # A one-letter word has no interior; the old expression doubled it
        # ("a" became "aa").
        target = input

    best = 1000
    out = ''
    for el in ar2:
        score = rank.dLev(el, target)
        if score < best:
            out = el
            best = score
    return out
def convert_section(cls, line: str, conversion_key: str) -> str:
    """Convert *line* according to *conversion_key*.

    * '[TO_IPA]': the line goes through ``cls._prepare_ipa``,
      ``ipa.convert`` (punctuation dropped) and ``cls._clean_ipa``.
    * empty key or '[DEFAULT]': the line is returned unchanged.
    * anything else: the unknown key is logged at INFO level and the line
      is returned unchanged.
    """
    if conversion_key == '[TO_IPA]':
        prepared = cls._prepare_ipa(line)
        transcribed = ipa.convert(prepared, keep_punct=False)
        return cls._clean_ipa(transcribed)

    if conversion_key and conversion_key != '[DEFAULT]':
        logger.info(f"Unknown conversion_key: {conversion_key}")
    return line
# https://github.com/mphilli/English-to-IPA
# python3 setup.py install
# python3 words_with_frequency_and_translation_and_ipa.py
import json

import eng_to_ipa as ipa

# Add an "ipa" field (primary stress marks only) to every entry of the word
# list and write the result to a new JSON file.
#
# Both files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so it contains raw IPA characters that a non-UTF-8
# platform default encoding cannot represent.
with open('words_with_frequency_and_translation.json', 'r',
          encoding='utf-8') as f:
    data = json.load(f)

for key, val in data.items():
    phonetic_symbol = ipa.convert(key,
                                  keep_punct=False,
                                  retrieve_all=False,
                                  stress_marks="primary")
    # Entries with an empty transcription are left without an "ipa" field.
    if phonetic_symbol:
        val["ipa"] = phonetic_symbol

with open("words_with_frequency_and_translation_and_ipa.json", "w",
          encoding='utf-8') as file:
    file.write(json.dumps(data, ensure_ascii=False))