def test_mismatch(caplog, text, mode):
    """Check the espeak backend's words_mismatch behaviour for each mode."""
    phonemized = phonemize(
        text, backend='espeak', language='en-us', words_mismatch=mode)

    logged = [record[2] for record in caplog.record_tuples]
    summary = 'words count mismatch on 67.0% of the lines (2/3)'
    unchanged = ['haʊ ɑːɹ juː ', 'aɪ hɐvbɪn bɪzi ', 'aɪ woʊntɐv taɪm ']

    if mode == 'ignore':
        # mismatched lines are kept, a single summary warning is logged
        assert phonemized == unchanged
        assert len(logged) == 1
        assert summary in logged
    elif mode == 'remove':
        # mismatched lines are blanked out, plus an extra removal notice
        assert phonemized == ['haʊ ɑːɹ juː ', '', '']
        assert len(logged) == 2
        assert summary in logged
        assert 'removing the mismatched lines' in logged
    elif mode == 'warn':
        # lines kept, one warning per mismatched line plus the summary
        assert phonemized == unchanged
        assert len(logged) == 3
        assert ('words count mismatch on line 2 '
                '(expected 4 words but get 3)' in logged)
        assert ('words count mismatch on line 3 '
                '(expected 4 words but get 3)' in logged)
        assert summary in logged
def phonemize_labels(file_name, column_name, language):
    """Phonemize one column of a TSV file in place.

    Reads ``file_name`` as a tab-separated table, converts the sentences in
    ``column_name`` to phonemes with the espeak backend, stores the result in
    a new ``sentence_phonemes`` column, and writes the table back to the same
    file. The backend must be installed beforehand
    (sudo apt-get install festival espeak-ng mbrola).

    Parameters
    ----------
    file_name: str
        Path of the TSV file containing the sentences to phonemize.
    column_name: str
        Name of the column holding the sentences.
    language: str
        Language of the sentences, see
        https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
    """
    table = pd.read_csv(file_name, sep='\t')
    table['sentence_phonemes'] = phonemize(
        table[column_name],
        language=language,
        backend='espeak',
        language_switch='remove-flags',
        njobs=8)
    table.to_csv(file_name, sep='\t')
def get_phone_string(self, text):
    """Convert raw text into an IPA phone string terminated by '#'.

    Cleans the text, phonemizes it with espeak, collapses punctuation
    into the '~' pause marker and optionally strips prosody markers and
    word boundaries.

    Parameters
    ----------
    text: str
        The raw input text.

    Returns
    -------
    str
        The phone string, ending with the '#' end-of-utterance marker.
    """
    utt = clean(text, fix_unicode=True, to_ascii=False, lower=False,
                lang=self.clean_lang)
    # NOTE(review): the return value is discarded; since strings are
    # immutable this call only matters if the method mutates internal
    # state — confirm against its definition.
    self.expand_abbrevations(utt)
    # Protect aligner silence tokens so they survive phonemization.
    utt = utt.replace("_SIL_", "~")
    phones = phonemizer.phonemize(
        utt,
        language_switch='remove-flags',
        backend="espeak",
        language=self.g2p_lang,
        preserve_punctuation=True,
        strip=True,
        punctuation_marks=';:,.!?¡¿—…"«»“”~',
        with_stress=self.use_stress)
    # Normalize punctuation: pause-like marks all become ',' first, then
    # every ',' becomes the '~' pause marker. The original chained
    # .replace("-", ",") twice; the second call was a no-op and is removed.
    phones = (phones.replace(";", ",").replace(":", ",").replace('"', ",")
              .replace("-", ",").replace("\n", " ").replace("\t", " ")
              .replace("¡", "").replace("¿", "").replace(",", "~"))
    # Collapse runs of pause markers into a single one.
    phones = re.sub("~+", "~", phones)
    if not self.use_prosody:
        # Strip stress/length/prosody symbols when prosody is disabled.
        phones = phones.replace("ˌ", "").replace("ː", "").replace(
            "ˑ", "").replace("˘", "").replace("|", "").replace("‖", "")
    if not self.use_word_boundaries:
        phones = phones.replace(" ", "")
    return phones + "#"
def text2phone(text, char2code):
    """Phonemize ``text`` and encode each phone as an integer id.

    Parameters
    ----------
    text: str
        The input text to phonemize.
    char2code: dict
        Mapping from phone string to integer id.

    Returns
    -------
    torch.LongTensor
        1-D tensor of phone ids.
    """
    # Phones are joined with spaces, no syllable/word separators.
    separator = phonemizer.separator.Separator('', '', ' ')
    phones = phonemizer.phonemize(text, separator=separator)
    # Drop ALL empty tokens: the trailing separator yields '' on split, and
    # the original list.remove('') only dropped the first occurrence and
    # raised ValueError when no empty token was present.
    tokens = [p for p in phones.split(' ') if p]
    return torch.LongTensor([char2code[p] for p in tokens])
def setup(self):
    """Install ``self.p`` as a shorthand for festival phonemization."""
    word_separator = separator.Separator(' ', '|', '-')

    def _phonemize(text):
        return phonemize(text, language='en-us', backend='festival',
                         strip=True, separator=word_separator)

    self.p = _phonemize
def english_cleaners2(text):
    '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
    # Normalize the text before phonemization.
    for step in (convert_to_ascii, lowercase, expand_abbreviations):
        text = step(text)
    phonemes = phonemize(text,
                         language='en-us',
                         backend='espeak',
                         strip=True,
                         preserve_punctuation=True,
                         with_stress=True)
    return collapse_whitespace(phonemes)
def string_to_tensor(self, text, view=False): """ Fixes unicode errors, expands some abbreviations, turns graphemes into phonemes and then vectorizes the sequence as IDs to be fed into an embedding layer """ # clean unicode errors, expand abbreviations, handle emojis etc. utt = clean(text, fix_unicode=True, to_ascii=False, lower=False, lang=self.clean_lang) self.expand_abbreviations(utt) # if an aligner has produced silence tokens before, turn # them into silence markers now so that they survive the # phonemizer: utt = utt.replace("_SIL_", "~") # phonemize phones = phonemizer.phonemize(utt, language_switch='remove-flags', backend="espeak", language=self.g2p_lang, preserve_punctuation=True, strip=True, punctuation_marks=';:,.!?¡¿—…"«»“”~/', with_stress=self.use_stress).replace(";", ",").replace("/", " ") \ .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \ .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~") phones = re.sub("~+", "~", phones) if not self.use_prosody: # retain ~ as heuristic pause marker, even though all other symbols are removed with this option. # also retain . ? and ! since they can be indicators for the stop token phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \ .replace("˘", "").replace("|", "").replace("‖", "") if not self.use_word_boundaries: phones = phones.replace(" ", "") else: phones = re.sub(r"\s+", " ", phones) if view: print("Phonemes: \n{}\n".format(phones)) phones_vector = list() # turn into numeric vectors for char in phones: if self.allow_unknown: phones_vector.append( self.ipa_to_vector.get(char, self.default_vector)) else: try: phones_vector.append(self.ipa_to_vector[char]) except KeyError: print("unknown phoneme: {}".format(char)) if self.use_explicit_eos: phones_vector.append(self.ipa_to_vector["end_of_input"]) return torch.LongTensor(phones_vector).unsqueeze(0)
def load_transcripts(path: str, split: str, use_percentage: float) -> dict:
    """Load a split's transcripts and return {clip path: 48-phone sequence}.

    Reads ``<path>/<split>.tsv``, phonemizes each sentence with festival
    (ASCII only) and maps the 60-phone set down to 48 phones, keeping only
    the first ``use_percentage`` fraction of rows.
    """
    frame = pd.read_csv(os.path.join(path, "{}.tsv".format(split)), sep='\t')
    m60_48, _ = load_phone_map()
    n_rows = int(use_percentage * len(frame))

    transcripts = {}
    for clip_name, sentence in tqdm(frame[['path', 'sentence']].values[:n_rows]):
        # strip non-ASCII characters before phonemizing
        ascii_text = sentence.encode('ascii', 'ignore').decode('ascii')
        phones = phonemize(text=ascii_text,
                           language='en-us',
                           backend='festival',
                           separator=phone_separator).split()
        # drop phones that have no 48-phone mapping
        transcripts[os.path.join(path, 'clips', clip_name)] = [
            m60_48[p] for p in phones if p in m60_48]
    return transcripts
def ipa_phonemize(text, lang="en-us", use_g2p=False):
    """Phonemize ``text`` to a space-separated phone string.

    Parameters
    ----------
    text: str
        Input text.
    lang: str
        Language code; must be "en-us" when ``use_g2p`` is True.
    use_g2p: bool
        Use the g2p_en backend instead of phonemizer/espeak.

    Returns
    -------
    str: phones separated by spaces ("|" marks word boundaries).

    Raises
    ------
    ImportError: when the selected backend package is not installed.
    """
    if use_g2p:
        assert lang == "en-us", "g2pE phonemizer only works for en-us"
        try:
            from g2p_en import G2p
        except ImportError:
            # Bug fix: the original message told users to install
            # "phonemizer" although this branch needs g2p_en.
            raise ImportError("Please install g2p_en: pip install g2p_en")
        g2p = G2p()
        # spaces between words become explicit "|" boundary tokens
        return " ".join("|" if p == " " else p for p in g2p(text))
    try:
        from phonemizer import phonemize
        from phonemizer.separator import Separator
    except ImportError:
        raise ImportError(
            "Please install phonemizer: pip install phonemizer")
    return phonemize(text, backend='espeak', language=lang,
                     separator=Separator(word="| ", phone=" "))
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
    """Convert ``text`` to a phone string via the configured phonemizer backend.

    Uses ``self.phonemizer_lang`` when no explicit ``phonemizer_lang`` is
    given; phones are joined with ``self.phone_delimiter_token`` and words
    with ``self.word_delimiter_token`` (when set).
    """
    requires_backends(self, "phonemizer")
    from phonemizer import phonemize
    from phonemizer.separator import Separator

    if self.word_delimiter_token is not None:
        word_sep = self.word_delimiter_token + " "
    else:
        word_sep = ""
    if phonemizer_lang is None:
        phonemizer_lang = self.phonemizer_lang

    sep = Separator(phone=self.phone_delimiter_token,
                    word=word_sep,
                    syllable="")
    result = phonemize(
        text,
        language=phonemizer_lang,
        backend=self.phonemizer_backend,
        separator=sep,
        language_switch="remove-flags",
    )
    return result.strip()
def phonemize(
    cls,
    text: str,
    lang: Optional[str],
    phonemizer: Optional[str] = None,
    preserve_punct: bool = False,
    to_simplified_zh: bool = False,
):
    """Phonemize ``text`` with the selected backend.

    ``phonemizer`` selects "g2p" (English), "g2pc" (Chinese pinyin) or
    "ipa" (espeak); any other value returns the text unchanged. When
    ``to_simplified_zh`` is True the text is first converted to
    simplified Chinese.
    """
    if to_simplified_zh:
        import hanziconv
        text = hanziconv.HanziConv.toSimplified(text)

    if phonemizer == "g2p":
        import g2p_en
        model = g2p_en.G2p()
        if preserve_punct:
            # spaces between words become explicit "|" boundary tokens
            return " ".join("|" if tok == " " else tok for tok in model(text))
        # map pause punctuation to "sp", drop everything non-alphanumeric
        mapped = [{",": "sp", ";": "sp"}.get(tok, tok) for tok in model(text)]
        return " ".join(tok for tok in mapped if tok.isalnum())

    if phonemizer == "g2pc":
        import g2pc
        model = g2pc.G2pC()
        return " ".join(entry[3] for entry in model(text))

    if phonemizer == "ipa":
        assert lang is not None
        import phonemizer as phonemizer_mod
        from phonemizer.separator import Separator
        lang_map = {"en": "en-us", "fr": "fr-fr"}
        return phonemizer_mod.phonemize(
            text,
            backend="espeak",
            language=lang_map.get(lang, lang),
            separator=Separator(word="| ", phone=" "),
        )

    # unknown/absent backend: pass the text through unchanged
    return text
def phonemize(lang, files):
    """Phonemize validated Common Voice sentences into phone-id annotations.

    Reads ``validated.tsv`` for ``lang``, phonemizes the sentence of every
    clip in ``files`` with espeak, assigns each phone character an integer
    id on first sight, then writes an annotation file and the
    phone-to-id mapping under ``DB_OUT/<lang>/``. Returns the annotations.
    """
    print("Phonemize")
    validated_set = pandas.read_csv(f"{DB_IN}/{lang}/validated.tsv", sep="\t")
    # key sentences by clip name without its 4-character audio extension
    validated_dict = {
        clip[:-4]: sentence
        for clip, sentence in zip(list(validated_set["path"]),
                                  list(validated_set["sentence"]))
    }

    phonemes_to_id = {}
    annotations = []
    total = len(files)
    for i, file in enumerate(files):
        ipa = phonemizer.phonemize(validated_dict[file],
                                   language=lang, backend="espeak")
        ids = []
        for symbol in ipa:
            if symbol == " ":
                continue
            # assign a fresh id the first time a phone character is seen
            if symbol not in phonemes_to_id:
                phonemes_to_id[symbol] = len(phonemes_to_id)
            ids.append(str(phonemes_to_id[symbol]))
        annotations.append(file + " " + " ".join(ids))
        print(f"\r{i+1}/{total} ....", end="", flush=True)
    print(f"\r \r", end="", flush=True)

    with open(f"{DB_OUT}/{lang}/annotations.txt", "w") as f:
        # Annotation file
        f.write("\n".join(annotations))
    with open(f"{DB_OUT}/{lang}/phonemes_to_id.json", "w") as f:
        # Phonemes to id
        json.dump(phonemes_to_id, f)
    return annotations
def collate_fn_common(batch, data_type, max_len_mel=2000, reconstructed_phoneme=False):
    """Collate a batch of (parts, flag) samples into padded model inputs.

    Each element of ``batch`` is a (parts, flag) pair. When the flag is
    True the sample already holds per-position sequences and the batch is
    flattened; otherwise each sample is a raw
    (waveform, sample_rate, client_id, sentence) tuple. Sentences are
    phonemized (unless ``reconstructed_phoneme``), tokenized and padded,
    waveforms are turned into mel spectrograms and padded.

    Returns (sentences_tensor, sentences_mask, spectrograms, mel_mask,
    waveforms, waveform_l, client_ids, example_id).
    """
    # Peek at the first sample to discover the batch layout.
    ggg, truefalse = batch[0]
    batch_size = len(batch)
    # Every sample decomposes into 6 parallel fields (see unpacking below).
    parts = 6
    final_list = []
    if truefalse:
        # Pre-split layout: flatten batch x seq into one long list per field.
        # assumes every sample has the same sequence length — TODO confirm
        seq_len = len(ggg[1])
        del ggg
        example_id = []
        for n in range(parts):
            final_list.append([None] * (batch_size * seq_len))
        for i in range(batch_size):
            part, truefalse = batch[i]
            for j in range(parts):
                final_list[j][i * seq_len:(i + 1) * seq_len] = part[j]
            # remember where each example starts in the flattened lists
            example_id.append(i * seq_len)
    else:
        del ggg
        # Raw-sample layout: one slot per sample and field.
        for n in range(parts):
            final_list.append([None] * (batch_size))
        for i in range(batch_size):
            part, truefalse = batch[i]
            waveform, sample_rate, client_id, sentence = part
            # expand the 4-tuple to the 6-field layout (lengths added)
            part = [
                waveform,
                waveform.shape[1], sample_rate, client_id, sentence,
                len(sentence)
            ]
            for j in range(parts):
                final_list[j][i] = part[j]
        example_id = None
    waveforms, waveform_l, sample_rates, client_ids, sentences, sentences_l = final_list
    if not reconstructed_phoneme:
        # convert sentences to phone strings ('- ' marks word boundaries)
        sentences = phonemize(sentences,
                              backend='espeak',
                              with_stress=False,
                              separator=separator.Separator(phone=' ',
                                                            syllable='',
                                                            word='- '))
    # recompute sentence lengths after phonemization
    for i in range(len(sentences_l)):
        sentences_l[i] = len(sentences[i])
    # index the longest sentence once to get the batch padding lengths
    biggest_l_index = sentences_l.index(max(sentences_l))
    token = tokenizer.tokenize(sentences[biggest_l_index])
    text_field = TextField(token, token_indexer)
    text_field.index(vocab)
    padding_lengths = text_field.get_padding_lengths()
    list_tokens = []
    mel_list = [None] * len(sample_rates)
    mel_list_l = [None] * len(sample_rates)
    for i in range(len(sentences_l)):
        token = tokenizer.tokenize(sentences[i])
        text_field = TextField(token, token_indexer)
        text_field.index(vocab)
        tensor_dict = text_field.as_tensor(padding_lengths)
        list_tokens.append(tensor_dict)
        # train and eval use different audio transforms
        if data_type == "train":
            mel_list[i] = train_audio_transforms(
                waveforms[i]).squeeze(0).transpose(0, 1)
        else:
            mel_list[i] = test_audio_transforms(
                waveforms[i]).squeeze(0).transpose(0, 1)
        mel_list_l[i] = mel_list[i].shape[0]
        waveforms[i] = waveforms[i].squeeze(0)
    waveforms = nn.utils.rnn.pad_sequence(waveforms,
                                          batch_first=True).unsqueeze(1)
    # append an all-zero mel of max_len_mel frames so pad_sequence pads
    # every spectrogram to the same fixed length
    mel_list.append(torch.zeros((max_len_mel, mel_list[0].shape[1])))
    spectrograms = nn.utils.rnn.pad_sequence(
        mel_list, batch_first=True).unsqueeze(1).transpose(2, 3)
    # NOTE(review): the dummy entry was appended at the END of mel_list, yet
    # this drops the FIRST element — that discards the first real sample and
    # keeps the zero pad. Looks like it should be spectrograms[:-1]; confirm.
    spectrograms = spectrograms[1:]
    highest_mel_l = spectrograms[0].shape[2]
    mel_mask = create_mask_pad(highest_mel_l, mel_list_l)
    text_field_tensors = text_field.batch_tensors(list_tokens)
    sentences_tensor = nn_util.get_token_ids_from_text_field_tensors(
        text_field_tensors)
    # '== False' is elementwise on the mask tensor: True where PADDING
    sentences_mask = nn_util.get_text_field_mask(text_field_tensors) == False
    return sentences_tensor, sentences_mask, spectrograms, mel_mask, waveforms, waveform_l, client_ids, example_id
def get_phonetic(w):
    """Return the espeak phonemization of ``w``."""
    return phonemize(w, backend="espeak")
def test_relative():
    """phonemize must be importable from the package root."""
    from phonemizer import phonemize
    result = phonemize('a')
    assert result == 'eɪ '
def test_absolute():
    """phonemize must also be importable from its defining module."""
    from phonemizer.phonemize import phonemize
    result = phonemize('a')
    assert result == 'eɪ '
def _test(text):
    """Festival-phonemize ``text`` with ' '/'|'/'-' separators, stripped."""
    sep = separator.Separator(' ', '|', '-')
    return phonemize(text,
                     language='en-us',
                     backend='festival',
                     strip=True,
                     separator=sep)
def string_to_tensor(self, text, view=False, return_string=False):
    """
    Fixes unicode errors, expands some abbreviations,
    turns graphemes into phonemes and then vectorizes
    the sequence either as IDs to be fed into an embedding
    layer, or as an articulatory matrix.

    Parameters
    ----------
    text: str
        Raw input text.
    view: bool
        When True, print the intermediate phone string for debugging.
    return_string: bool
        When True, return the phone string (terminated by '#') instead
        of the id tensor.
    """
    # clean unicode errors, expand abbreviations
    utt = clean(text, fix_unicode=True, to_ascii=False, lower=False,
                lang=self.clean_lang)
    # NOTE(review): return value discarded — strings are immutable, so this
    # only has an effect if the method mutates internal state; confirm.
    self.expand_abbrevations(utt)
    # phonemize with code switching
    if self.use_codeswitching:
        # per-word language identification
        cs_dicts = self.lid.identify(utt)
        chunks = []
        for i in range(len(cs_dicts)):
            word = cs_dicts[i]['word']
            cs_lang = cs_dicts[i]['entity']
            # print(word, "\t", cs_lang)
            # map the detected entity label to a g2p language code
            if cs_lang == 'spa' or cs_lang == 'other':
                g2p_lang = 'es'
            elif cs_lang == 'en':
                g2p_lang = 'en-us'
            elif cs_lang == 'ne':
                # named entities: English city names get English g2p
                if word in self.en_cities:
                    g2p_lang = 'en-us'
                else:
                    g2p_lang = 'es'
            else:
                g2p_lang = 'es'
            if i == 0:
                # start the first chunk
                current_lang = g2p_lang
                current_chunk = word
                continue
            if word.startswith('##') or word.startswith(
                    "'") or word == "s":
                g2p_lang = current_lang  # wordpieces of one word should all have the same language
            if g2p_lang == current_lang:
                # extend the running same-language chunk
                current_chunk += " " + word
            else:
                # language changed: close the current chunk, start a new one
                chunks.append({
                    'word': current_chunk,
                    'lang': current_lang
                })
                current_chunk = word
                current_lang = g2p_lang
        # close the final chunk
        chunks.append({'word': current_chunk, 'lang': current_lang})
        chunks = self.postprocess_codeswitch(chunks)
        # phonemize chunks
        phones_chunks = []
        for chunk in chunks:
            # chunk = self.postprocess_codeswitch_simple(chunk)  # uncomment this line if postprocessing doesn't work
            seq = chunk['word']
            g2p_lang = chunk['lang']
            # print('seq: ', seq, '\t', g2p_lang)
            # phonemize the chunk in its own language, then fold
            # punctuation into the '~' pause marker
            phones_chunk = phonemizer.phonemize(seq,
                                                language_switch='remove-flags',
                                                backend="espeak",
                                                language=g2p_lang,
                                                preserve_punctuation=True,
                                                strip=True,
                                                punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                                with_stress=self.use_stress).replace(";", ",") \
                .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \
                .replace("\t", " ").replace("/", " ").replace("¡", "").replace("¿", "").replace(",", "~")
            if g2p_lang == 'en-us':
                # map English phones onto the shared phone set
                phones_chunk = self.map_phones(phones_chunk)
            # presumably: pad longer chunks with pause markers so chunk
            # boundaries are audible — TODO confirm the >4 threshold intent
            if len(phones_chunk.split()) > 4:
                phones_chunks.append("~" + phones_chunk + "~")
            else:
                phones_chunks.append(phones_chunk)
        phones = ' '.join(phones_chunks)
        # tidy up spaces left around pause/punctuation markers
        phones = phones.replace(" ~", "~").replace(" .", ".").replace(
            " !", "!").replace(" ?", "?").lstrip()
        phones = re.sub("~+", "~", phones)
    else:
        # just phonemize without code switching
        phones = phonemizer.phonemize(utt,
                                      language_switch='remove-flags',
                                      backend="espeak",
                                      language=self.g2p_lang,
                                      preserve_punctuation=True,
                                      strip=True,
                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                      with_stress=self.use_stress).replace(";", ",") \
            .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \
            .replace("\t", " ").replace("/", " ").replace("¡", "").replace("¿", "").replace(",", "~")
        phones = re.sub("~+", "~", phones)
    if not self.use_prosody:
        # retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
        # also retain . ? and ! since they can be indicators for the stop token
        phones = phones.replace("ˌ", "").replace("ː", "").replace(
            "ˑ", "").replace("˘", "").replace("|", "").replace("‖", "")
    if not self.use_word_boundaries:
        phones = phones.replace(" ", "")
    else:
        phones = re.sub(r"\s+", " ", phones)
    # '+' marks start of input
    phones = "+" + phones
    # I have no idea how this happened, but the synthesis just cannot pronounce ɔ.
    # Seems like it did not occur in the training data, maybe aligner removed it? As hacky fix, use o instead.
    phones = phones.replace("ɔ", "o") + "~"
    # phones = self.map_phones(phones)
    if view:
        print("Phonemes: \n{}\n".format(phones))
    phones_vector = list()
    # turn into numeric vectors
    for char in phones:
        if self.allow_unknown:
            # unknown phones map to the default vector instead of failing
            phones_vector.append(
                self.ipa_to_vector.get(char, self.default_vector))
        else:
            # silently skip phones missing from the lookup table
            if char in self.ipa_to_vector.keys():
                phones_vector.append(self.ipa_to_vector[char])
    if self.use_explicit_eos:
        phones_vector.append(self.ipa_to_vector["end_of_input"])
    # combine tensors and return
    if not return_string:
        return torch.LongTensor(phones_vector).unsqueeze(0)
    else:
        return phones + "#"
def text2phoneme(self, batch):
    """Convert text to phoneme."""
    sentence = batch["sentence"]
    batch["sentence"] = phonemize(sentence,
                                  language=self.language,
                                  backend="espeak")
    return batch
def main():
    """Phonemize a text from command-line arguments"""
    args = parse_args()

    # setup a custom path to espeak and festival if required (this must be done
    # before generating the version message)
    if args.espeak_path:
        EspeakBackend.set_espeak_path(args.espeak_path)
    if args.festival_path:
        FestivalBackend.set_festival_path(args.festival_path)

    # display version information and exit
    if args.version:
        print(version.version())
        return

    # list supported languages and exit; without an explicit --backend the
    # languages of every backend are listed
    if args.list_languages:
        backends = (['festival', 'segments', 'espeak', 'espeak-mbrola']
                    if not args.backend else [args.backend])
        for backend in backends:
            print(f'supported languages for {backend} are:\n' +
                  '\n'.join(f'\t{k}\t->\t{v}' for k, v in sorted(
                      BACKENDS_MAP[backend].supported_languages().items())))
        return

    # set default backend as espeak if not specified
    args.backend = args.backend or 'espeak'

    # configure logging according to --verbose/--quiet options
    verbosity = 'normal'
    if args.verbose:
        verbosity = 'verbose'
    elif args.quiet:
        verbosity = 'quiet'
    log = logger.get_logger(verbosity=verbosity)

    # configure input as a readable stream; a string argument is a file path
    streamin = args.input
    if isinstance(streamin, str):
        streamin = codecs.open(streamin, 'r', encoding='utf8')
    log.debug('reading from %s', streamin.name)

    # configure output as a writable stream; a string argument is a file path
    streamout = args.output
    if isinstance(streamout, str):
        streamout = codecs.open(streamout, 'w', 'utf8')
    log.debug('writing to %s', streamout.name)

    # configure the separator for phonemes, syllables and words.
    if args.backend == 'espeak-mbrola':
        # mbrola output has no word/syllable structure
        log.debug('using espeak-mbrola backend: ignoring word separator')
        sep = separator.Separator(phone=args.phone_separator,
                                  syllable=None,
                                  word=None)
    else:
        sep = separator.Separator(phone=args.phone_separator,
                                  syllable=args.syllable_separator,
                                  word=args.word_separator)
    log.debug('separator is %s', sep)

    # read the whole input, one utterance per line
    text = [line.strip() for line in streamin]

    # phonemize the input text
    out = phonemize(text,
                    language=args.language,
                    backend=args.backend,
                    separator=sep,
                    strip=args.strip,
                    preserve_punctuation=args.preserve_punctuation,
                    punctuation_marks=args.punctuation_marks,
                    with_stress=args.with_stress,
                    language_switch=args.language_switch,
                    njobs=args.njobs,
                    logger=log)

    # write the result, one phonemized utterance per line
    if out:
        streamout.write('\n'.join(out) + '\n')
def test_bad():
    """Invalid words_mismatch configurations must raise RuntimeError."""
    # unknown mismatch mode
    with pytest.raises(RuntimeError):
        phonemize('', words_mismatch='foo')

    # festival backend does not support removing mismatched lines
    with pytest.raises(RuntimeError):
        phonemize('', backend='festival', words_mismatch='remove')
def main():
    """Phonemize a text from command-line arguments"""
    args = parse_args()

    # setup a custom path to espeak and festival if required (this must be done
    # before generating the version message)
    if args.espeak_library:
        BACKENDS['espeak'].set_library(args.espeak_library)
    if args.festival_executable:
        BACKENDS['festival'].set_executable(args.festival_executable)

    # display version information and exit
    if args.version:
        print(version.version())
        return

    # list supported languages and exit
    if args.list_languages:
        print(list_languages(args.backend))
        return

    # set default backend as espeak if not specified
    args.backend = args.backend or 'espeak'

    # configure logging according to --verbose/--quiet options
    log = get_logger(args.verbose, args.quiet)

    # configure input:output as a readable/writable streams
    streamin = setup_stream(args.input, 'r')
    log.debug('reading from %s', streamin.name)
    streamout = setup_stream(args.output, 'w')
    log.debug('writing to %s', streamout.name)

    # configure the separator for phonemes, syllables and words.
    if args.backend == 'espeak-mbrola':
        # mbrola output has no word/syllable structure
        log.debug('using espeak-mbrola backend: ignoring word separator')
        sep = separator.Separator(phone=args.phone_separator,
                                  syllable=None,
                                  word=None)
    else:
        sep = separator.Separator(phone=args.phone_separator,
                                  syllable=args.syllable_separator,
                                  word=args.word_separator)
    log.debug('separator is %s', sep)

    # when requested, each output line is prefixed with its input text,
    # joined by a separator derived from the phone/word separators
    if args.prepend_text:
        input_output_separator = sep.input_output_separator(args.prepend_text)
        log.debug('prepend input text to output, separator is "%s"',
                  input_output_separator)
    else:
        input_output_separator = False

    # phonemize the input text
    out = phonemize(streamin.readlines(),
                    language=args.language,
                    backend=args.backend,
                    separator=sep,
                    strip=args.strip,
                    prepend_text=args.prepend_text,
                    preserve_punctuation=args.preserve_punctuation,
                    punctuation_marks=args.punctuation_marks,
                    with_stress=args.with_stress,
                    tie=args.tie,
                    language_switch=args.language_switch,
                    words_mismatch=args.words_mismatch,
                    njobs=args.njobs,
                    logger=log)

    # write the result: "<input> <sep> <phonemized>" lines when prepending
    # input text, plain phonemized lines otherwise
    if out and input_output_separator:
        streamout.write(
            os.linesep.join(f'{line[0]} {input_output_separator} {line[1]}'
                            for line in out) + os.linesep)
    elif out:
        streamout.write(os.linesep.join(out) + os.linesep)
# Smoke check: phonemize a single English word with the en-us voice
# and print the resulting phone string.
from phonemizer import phonemize

ret = phonemize('English', language='en-us')
print(ret)
def text2phoneme(text):
    """Phonemize ``text``, joining phones with the module-level ``char_sep``.

    Bug fix: the original passed the misspelled keyword ``seperator``;
    ``phonemizer.phonemize`` only accepts ``separator``, so the call
    raised TypeError and the separator was never applied.
    """
    text = phonemizer.phonemize(text, separator=char_sep)
    return text