def google_STT(self, audio):
    client = speech_v1.SpeechClient.from_service_account_json(
        '/data/second-conquest-293723-05738e995f8f.json')
    # Load the audio into memory
    with io.open(audio, "rb") as audio_file:
        content = audio_file.read()
    audio = speech_v1.RecognitionAudio(content=content)
    config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    # Detect speech in the audio file and concatenate the top alternatives
    response = client.recognize(request={"config": config, "audio": audio})
    text = ''
    for result in response.results:
        text = text + result.alternatives[0].transcript
    text = english_cleaners(text)
    return text
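# Usage sketch (hypothetical): google_STT is an instance method, so it would
# be called on whatever class defines it, with the path to a 22050 Hz wav
# file. ENCODING_UNSPECIFIED lets the API read the codec from the file
# header, which works for self-describing formats such as WAV and FLAC.
#
#   text = stt.google_STT("/data/wavs/0001.wav")  # "stt" is a made-up instance name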
def make_arpabet(text):
    # g2p functions
    g2p = G2p()
    # Punctuation that must not be wrapped in curly braces; the dict maps
    # " X" -> "X" to undo the spacing introduced when joining g2p tokens
    punc = list("!?,.;:#-_'\"()[]\n")
    punc_dict = {" " + p: p for p in punc}
    # Text processing: wrap every ARPAbet token in curly braces,
    # leaving punctuation marks as-is
    tokens = " ".join(g2p(english_cleaners(text))).split(" ")
    outlist = []
    for item in tokens:
        item = item.strip()
        if item not in punc:
            item = "{" + item + "}"
        outlist.append(item)
    text = " ".join(outlist)
    for key, replacement in punc_dict.items():
        text = text.replace(key, replacement)
    return text
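# Quick sanity check for make_arpabet (assumes the g2p_en package is
# installed and G2p imported from it). Each phoneme ends up in its own
# curly-brace group so the text sequencer can tell ARPAbet symbols from
# literal characters; the exact tokens depend on the g2p model.
if __name__ == "__main__":
    print(make_arpabet("Hello world."))
    # e.g. "{HH} {AH0} {L} {OW1} {W} {ER1} {L} {D}." (roughly -- spacing
    # around the word boundary may differ)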
def convert_to_ipa(texts):
    print("Converting training files to IPA notation...")
    epi = epitran.Epitran('eng-Latn', ligatures=True)
    for text_mel_pair in texts:
        text_mel_pair[1] = ipa.convert(english_cleaners(text_mel_pair[1]))
        # eng_to_ipa flags words it cannot convert with a trailing "*";
        # fall back to Epitran for those
        foreign_words = re.findall(r"[^ ]*\*", text_mel_pair[1])
        for word in foreign_words:
            text_mel_pair[1] = text_mel_pair[1].replace(
                word, epi.transliterate(word[:-1]))
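# A minimal usage sketch, assuming `texts` is a list of mutable
# [mel_path, transcript] pairs (the training filelist format); the function
# rewrites the transcripts in place. The path below is a placeholder.
if __name__ == "__main__":
    pairs = [["mels/ljs-mel-00001.pt", "Printing, then, for our purpose."]]
    convert_to_ipa(pairs)
    print(pairs[0][1])  # the transcript, now in IPA notation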
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):
    # Make synthesis paths
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print("Creating directory " + output_directory + "...")
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    print("Loading models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    genlist = []
    with open(text_file) as file:
        for line in file:
            genlist.append(line.strip())

    epi = epitran.Epitran('eng-Latn', ligatures=True)
    for entry in genlist:
        wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"
        if hparams.preprocessing == "ipa":
            entry = ipa.convert(english_cleaners(entry))
            foreign_words = re.findall(r"[^ ]*\*", entry)
            for word in foreign_words:
                entry = entry.replace(word, epi.transliterate(word[:-1]))
        if hparams.preprocessing == "arpabet":
            entry = make_arpabet(entry)

        # Text sequencer: preprocessed text is already cleaned, otherwise
        # clean during sequencing
        if hparams.preprocessing is not None:
            sequence = np.array(text_to_sequence(entry, None))[None, :]
        else:
            sequence = np.array(text_to_sequence(entry, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).cuda().long()

        # Synthesis
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio_denoised = denoiser(audio, strength=0.01)[:, 0]

        # Save audio
        print("Saving " + wav_name)
        write(os.path.join(output_directory, wav_name),
              hparams.sampling_rate,
              audio_denoised[0].data.cpu().numpy())
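# Hypothetical invocation, assuming trained checkpoints and a text file with
# one sentence per line; every path below is a placeholder:
if __name__ == "__main__":
    generate_from_file(
        "checkpoints/tacotron2_statedict.pt",
        "checkpoints/waveglow_256channels.pt",
        "sentences.txt",
        "synth_output")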
def get_first_words_idx(script_path):
    with open(script_path) as f:
        text = f.read()
    sents = sent_tokenize(text)
    print('sentences number: ', len(sents))
    text = english_cleaners(text)
    words = text.split(' ')
    first_words_idx = [0]
    # Collect the index of the first word of each sentence
    # (the word right after sentence-final punctuation)
    for idx, word in enumerate(words):
        if word.endswith('.') or word.endswith('?') or word.endswith('!'):
            first_words_idx.append(idx + 1)
    return first_words_idx
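# Worked example (hypothetical script.txt containing the single line below):
#
#   How are you today? Fine thanks and you? Goodbye.
#
# After cleaning, the words are indexed 0..8, so the function returns
# [0, 4, 8, 9]: one entry per sentence start, plus a trailing index one
# past the final word (since the last word also ends with punctuation).
if __name__ == "__main__":
    print(get_first_words_idx("script.txt"))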
def read_csv(path, fn_encoding='UTF8'):
    # Reads a tab-separated file into audio snippet names and their
    # transcripts; the third column is unused
    with open(path, encoding=fn_encoding) as f:
        data = []
        for line in f:
            audio_name, audio_transcript, deprecated_1 = line.split('\t')
            replace_audio_name = audio_name.replace("/", "\\")
            audio_transcript = audio_transcript.strip()
            # Strip quotes and brackets before normalization
            audio_transcript = (audio_transcript.replace("\"", "")
                                .replace("(", "").replace(")", "")
                                .replace("[", "").replace("]", ""))
            audio_normalized_transcript = english_cleaners(audio_transcript)
            data.append(Data(replace_audio_name, audio_transcript,
                             audio_normalized_transcript))
    return data
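# Input format sketch: three tab-separated columns per line, e.g. the
# made-up entry
#
#   wavs/chapter01/0001.wav<TAB>"Hello, world."<TAB>unused
#
# would yield Data('wavs\\chapter01\\0001.wav', 'Hello, world.',
# 'hello, world.') -- forward slashes become backslashes for Windows
# paths, and english_cleaners lower-cases the normalized transcript.
if __name__ == "__main__":
    for d in read_csv("metadata.tsv"):  # placeholder path
        print(d)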
def MFA_file_prep(self, hparams):
    '''
    Creates a .lab file for every existing wav file in the hparams.wav_path
    folder. Each .lab file contains the cleaned/normalized text from the
    csv file (abbreviations and numbers expanded, punctuation removed,
    upper-cased).

    The function also checks whether the words in the content are present
    in the dictionary, and builds a {alien_word: phonemes(alien_word)}
    mapping for the missing ones using g2p_en.

    input  - hparams
    output - new_word_dictionary
    '''
    filenames = []
    content = []
    update_words = {}
    with open(hparams.csv_path, encoding='utf-8') as f:
        for lines in f:
            filenames.append(lines.split("|")[0])
            content.append(lines.split('|')[1])
    words = self.load_words_from_dict(hparams)
    for i in range(len(filenames)):
        if os.path.exists(f'{hparams.lab_path}/{filenames[i]}.wav'):
            path = os.path.join(hparams.lab_path, filenames[i] + ".lab")
            clean_content = english_cleaners(content[i])
            clean_content = punctuation_removers(clean_content)  # remove punctuation
            with open(path, 'w+') as lab_file:
                lab_file.write(clean_content.upper())
            # Words not covered by the pronunciation dictionary
            alien = set(clean_content.upper().split()) - set(words)
            alien_update = {word: g2p(word) for word in alien}
            update_words = {**update_words, **alien_update}
    if update_words:
        print("update your dictionary using update_dict() of this class")
    else:
        print("No dictionary update required")
    return update_words  # words to update
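# Sketch of the expected return value, assuming the MFA dictionary lacks
# the word "TACOTRON": the g2p_en fallback would produce something like
#
#   {'TACOTRON': ['T', 'AE1', 'K', 'OW0', 'T', 'R', 'AA2', 'N']}
#
# (exact phonemes depend on the g2p model), ready to be merged into the
# dictionary via update_dict().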
def test_cleaner_pipelines():
    text = 'Mr. Müller ate 2 Apples'
    assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
    assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
    assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'
def text_recognition(path, config):
    root, ext = os.path.splitext(path)
    txt_path = root + ".txt"

    # Return the cached transcript if it already exists
    if os.path.exists(txt_path):
        with open(txt_path) as f:
            out = json.loads(f.read())
        return out

    # If a new api account is used, reset the env file for the google credential
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    out = {}
    error_count = 0
    tmp_path = os.path.splitext(path)[0] + ".wav"
    # Create the client once outside the retry loop; recreating it per
    # attempt caused "10060 max retries exceeded" OAuth errors
    client = speech.SpeechClient()

    while True:
        try:
            with io.open(tmp_path, 'rb') as f:
                audio = types.RecognitionAudio(content=f.read())

            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=16000,
                language_code='en-GB')

            response = client.recognize(config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives
                # results holds the recognized text
                results = [
                    alternative.transcript for alternative in alternatives
                ]
                assert len(results) == 1, "More than 1 results: {}".format(
                    results)
                # store the actual transcript plus its normalized form
                out = {
                    os.path.basename(path): "" if len(results) == 0 else results[0],
                    "normalized_text": english_cleaners(results[0])
                }
                print(path, results[0], english_cleaners(results[0]))
            break
        except Exception as err:
            error_count += 1
            print("Skip warning for {} for {} times ({})".format(
                path, error_count, err))
            if error_count > 5:
                break
            else:
                continue

    if len(out) == 0:
        # remove file that only has instrument sound
        os.remove(root + '.wav')
        print(root, '.wav file is removed!')
    else:
        with open(txt_path, 'w') as f:
            json.dump(out, f, indent=2, ensure_ascii=False)

    return out
def english_cleaner():
    # english_cleaners lower-cases its input (see test_cleaner_pipelines
    # above), so the expected string must be lower-case as well
    actual = cleaners.english_cleaners(
        'I want to be there early on the day. Please organize')
    expected = 'i want to be there early on the day. please organize'
    assert actual == expected