def SpeakLongText(long_text, max_text_length=GOOGLE_MAX_TEXT_LENGTH):
    """Convert a full-length text into a single MP3.

    The text is split into chunks short enough for the TTS API, each chunk is
    synthesized concurrently in its own worker process, and the resulting
    MP3s are concatenated (after a short 300 Hz lead-in tone).

    Args:
        long_text: The full text to synthesize.
        max_text_length: Maximum characters per TTS request chunk.

    Returns:
        A NamedTemporaryFile (delete=False) containing the combined MP3.
    """
    # Split the long_text into short_texts small enough to TTS.
    long_text_as_short_texts = SplitTextToShortTexts(long_text, max_text_length)

    # Allocate a temporary directory for the per-chunk MP3s.
    with tempfile.TemporaryDirectory() as temp_dir:
        # NOTE: Google's text-to-speech library creates a TCP connection for
        # each request but does not close it; these stay open even after the
        # Client is de-referenced. Each uses a file descriptor, so a large
        # book hits the FD limit and crashes. Running each TTS in its own
        # process guarantees that all are released when the pool shuts down.
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=MAX_CONCURRENT_GOOGLE_API_REQUESTS) as executor:

            # Synthesize one chunk in a worker process, bounded by the semaphore.
            async def GenerateShortTextInProcess(semaphore, short_text):
                async with semaphore:
                    loop = asyncio.get_running_loop()
                    return await loop.run_in_executor(
                        executor, SpeakShortText, short_text, temp_dir)

            # Fan out all chunks concurrently and gather results in order.
            async def GenerateAllShortTexts():
                # The semaphore must be created while the loop is running so
                # it binds to the correct event loop (pre-3.10 asyncio binds
                # sync primitives to the loop current at construction time).
                semaphore = asyncio.Semaphore(MAX_CONCURRENT_GOOGLE_API_REQUESTS)
                mp3_generation_tasks = [
                    GenerateShortTextInProcess(semaphore, short_text)
                    for short_text in long_text_as_short_texts
                ]
                return await asyncio.gather(*mp3_generation_tasks)

            # asyncio.run replaces the deprecated
            # get_event_loop()/run_until_complete() pair and guarantees the
            # loop is closed afterwards.
            mp3s_of_short_texts = asyncio.run(GenerateAllShortTexts())
        # Leaving the `with` block performs executor.shutdown(wait=True);
        # no explicit call is needed.

        # Combine the short_texts into a single MP3, starting with a short
        # 300 Hz lead-in tone (preserved from the original behavior).
        mp3_long_text = Sine(300).to_audio_segment(duration=500)
        for mp3_short_text in mp3s_of_short_texts:
            mp3_long_text = mp3_long_text.append(
                AudioSegment.from_mp3(mp3_short_text))

        # Return the full MP3 as a temporary file; delete=False so the caller
        # can use it after this function returns.
        temporary_mp3 = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
        mp3_long_text.export(temporary_mp3.name, format="mp3")
        return temporary_mp3
def text_to_audio(text, file_name, export_file_format,  # e.g. "ogg"
                  codec=None,  # e.g. "opus"
                  frequency=700, wpm=10, cross_fade=2):
    """Render *text* as a sequence of sine tones and write it to *file_name*.

    Args:
        text: Input sentence to convert.
        file_name: Output path for the rendered audio.
        export_file_format: Container format passed to pydub, e.g. "ogg".
        codec: Optional codec name passed to pydub, e.g. "opus".
        frequency: Tone frequency in Hz.
        wpm: Words-per-minute pacing; converted to a unit length in seconds.
        cross_fade: Cross-fade length (ms) between consecutive segments.
    """
    unit_seconds = wpm_to_unit_length_seconds(wpm)
    # Begin with a short silence so the first cross-fade has audio to blend into.
    audio = Sine(0).to_audio_segment(cross_fade)
    for chunk in sentence_to_intervals(text):
        tone = interval_to_wave_data_segment(chunk, frequency, unit_seconds)
        audio = audio.append(tone, crossfade=cross_fade)
    audio.export(file_name, format=export_file_format, codec=codec)
def generate_notes(current_label):
    """Generate per-note WAV segments for each mean-chroma CSV of *current_label*.

    For every audio file listed in the module-level ``labels_df`` and every
    ``mean_<current_label>/*.csv`` chroma file, the audio duration is divided
    evenly across the CSV rows, and for each row one sine-tone WAV per pitch
    class (C..B) is exported to ``segments_of_notes_<current_label>/<csv stem>/``.
    Tone volume is derived from the row's chroma value (louder chroma ->
    less attenuation).

    Relies on module-level globals: ``labels_df``, ``data_dir``, ``emo_choice``.

    Raises:
        ValueError: if ``emo_choice`` is not 'low' or 'high' (the original
            code crashed with a NameError in that case).
    """
    # Semitone frequencies in Hz: the C2 octave for 'low', C3 for 'high'.
    note_frequencies = {
        'low': {   # C2 .. B2
            'C': 65.41, 'Csh': 69.30, 'D': 73.42, 'Dsh': 77.78,
            'E': 82.41, 'F': 87.31, 'Fsh': 92.50, 'G': 98.00,
            'Gsh': 103.83, 'A': 110.00, 'Ash': 116.54, 'B': 123.47,
        },
        'high': {  # C3 .. B3
            'C': 130.81, 'Csh': 138.59, 'D': 146.83, 'Dsh': 155.56,
            'E': 164.81, 'F': 174.61, 'Fsh': 185.00, 'G': 196.00,
            'Gsh': 207.65, 'A': 220.00, 'Ash': 233.08, 'B': 246.94,
        },
    }
    try:
        frequencies = note_frequencies[emo_choice]
    except KeyError:
        raise ValueError(
            "emo_choice must be 'low' or 'high', got %r" % (emo_choice,))
    note_names = list(frequencies)
    volume_reduct = 50  # dB of attenuation applied to every tone

    for _, label_row in labels_df.iterrows():
        audio = label_row['1_filename.csv']
        path = 'mean_' + current_label
        mean_location = [f for f in os.listdir(path) if f.endswith('csv')]
        for filename in mean_location:
            y, sr = librosa.load(data_dir + '/' + audio)
            audio_duration = librosa.get_duration(y=y, sr=sr)
            current_composer = pd.read_csv(path + '/' + filename)
            # Divide the audio duration evenly across the chroma rows.
            chroma_points = len(current_composer)
            samples_ms = (audio_duration / chroma_points) * 1000
            # makedirs creates both the parent and the per-file directory;
            # exist_ok avoids the exists-check/makedirs race.
            out_dir = ('segments_of_notes_' + current_label + '/'
                       + filename[:-4])
            os.makedirs(out_dir, exist_ok=True)
            # The tone duration is the same for every row of this file, so
            # synthesize each base tone once instead of once per row.
            base_tones = {
                note: Sine(frequencies[note]).to_audio_segment(
                    duration=samples_ms)
                for note in note_names
            }
            for row_number, (_, row) in enumerate(
                    current_composer.iterrows(), start=1):
                row_numb = str(row_number).zfill(2)
                for note in note_names:
                    # Chroma magnitude sets the volume: negate the mean and
                    # subtract a fixed headroom, as in the original code.
                    volume = np.negative(np.mean(row[note])) - volume_reduct
                    tone = base_tones[note] + volume
                    tone.export(out_dir + '/' + row_numb + '_' + note + '.wav',
                                format="wav")
            print('segmented audio generated for : ' + filename)