def read_mp3(filepath, plot=False, nperseg=5000, noverlap=0):
    """Load an audio file and return its first channel as a time-indexed array.

    Args:
        filepath: Path of the audio file passed to ``audio2numpy.open_audio``.
        plot: When truthy, plot the waveform and the magnitude of its STFT.
        nperseg: Samples per STFT segment (only used when plotting).
        noverlap: Samples of overlap between STFT segments (only used when
            plotting).

    Returns:
        An ``xarray.DataArray`` with a single ``t`` dimension whose coordinate
        is the sample index divided by the sample rate. For multi-column
        input only column 0 is kept.
    """
    import audio2numpy as _a2n

    # The caller's path is used as given; a leftover hard-coded debug path
    # used to overwrite it here, so the argument was silently ignored.
    audio, f_s = _a2n.open_audio(filepath)
    t = _np.arange(len(audio)) / f_s

    # 1-D input is already a single channel; otherwise keep the first column.
    samples = audio if len(audio.shape) == 1 else audio[:, 0]
    audio = _xr.DataArray(samples, dims='t', coords=[t])

    if plot:
        fig, ax = _plt.subplots()
        audio.plot(ax=ax)

        # stft is asked not to plot by itself; its magnitude is drawn below.
        stft_results = stft(audio,
                            numberSamplesPerSegment=nperseg,
                            numberSamplesToOverlap=noverlap,
                            plot=False,
                            logScale=True)
        fig, ax = _plt.subplots()
        _np.abs(stft_results).plot(ax=ax)

    return audio
def read(self, path: str, depth=LOG_NO_DEPTH) -> None:
    """Read an audio file and populate the instance's audio attributes.

    Tries ``soundfile`` first and falls back to ``audio2numpy`` when
    soundfile raises ``RuntimeError``.

    Sets ``self.stereo_data`` (channels-first array), ``self.sample_rate``,
    ``self.duration`` (seconds), ``self.channels`` and ``self.mono_data``.

    Args:
        path: Path of the audio file to read.
        depth: Prefix prepended to every log message.
    """
    debug_prefix = "[AudioFile.read]"

    logging.info(f"{depth}{debug_prefix} Reading stereo audio in path [{path}], trying soundfile")

    try:
        self.stereo_data, self.sample_rate = soundfile.read(path)
    except RuntimeError:
        # logging.warn is a deprecated alias; logging.warning is the supported name.
        logging.warning(f"{depth}{debug_prefix} Couldn't read file with soundfile, trying audio2numpy..")
        self.stereo_data, self.sample_rate = audio2numpy.open_audio(path)

    # We need a (channels, -1) array. A mono file arrives as a 1-D array,
    # for which .T is a no-op and shape[1] does not exist, so give it an
    # explicit single-channel axis instead of transposing.
    logging.info(f"{depth}{debug_prefix} Transposing audio data")
    if self.stereo_data.ndim == 1:
        self.stereo_data = self.stereo_data.reshape(1, -1)
    else:
        self.stereo_data = self.stereo_data.T

    # Duration in seconds and number of channels
    self.duration = self.stereo_data.shape[1] / self.sample_rate
    self.channels = self.stereo_data.shape[0]

    # Log some info about the audio file
    logging.info(f"{depth}{debug_prefix} Duration of the audio file = [{self.duration:.2f}s]")
    logging.info(f"{depth}{debug_prefix} Audio sample rate is [{self.sample_rate}]")
    logging.info(f"{depth}{debug_prefix} Audio data shape is [{self.stereo_data.shape}]")
    logging.info(f"{depth}{debug_prefix} Audio have [{self.channels}]")

    # Mono mix: average of the first two channels, or the only channel as-is
    logging.info(f"{depth}{debug_prefix} Calculating mono audio")
    if self.channels >= 2:
        self.mono_data = (self.stereo_data[0] + self.stereo_data[1]) / 2
    else:
        self.mono_data = self.stereo_data[0]

    # Just make sure the mono data is right..
    logging.info(f"{depth}{debug_prefix} Mono data shape: [{self.mono_data.shape}]")
def run(self):
    """Consume media requests from ``self.media_queue`` until shut down.

    The string ``'SHUTDOWN'`` stops the loop. Any other item is indexed as
    ``(kind, path[, options])``; only ``kind == 'audio'`` is handled. The
    optional ``options`` mapping may carry ``device`` and ``volume``. Files
    that cannot be decoded are reported as ``('ERR', text)`` entries on
    ``self.buffer_queue``. Playback blocks until the sound has finished.
    """
    while self._keep_listening:
        request = self.media_queue.get()

        if request == 'SHUTDOWN':
            self._keep_listening = False
            break

        # Ignore malformed requests and anything that is not audio.
        if len(request) < 2 or request[0] != 'audio':
            continue

        options = request[2] if len(request) >= 3 else {}
        device = options.get('device', None)
        volume = options.get('volume', 100)

        try:
            data, fs = audio2numpy.open_audio(request[1])
        except AudioFormatError:
            for text in ('Invalid File Format:',
                         request[1],
                         'Accepted file formats: .wav .mp3'):
                self.buffer_queue.put(('ERR', text))
            continue

        # Only integer volumes other than 100 rescale the samples.
        if volume != 100 and type(volume) is int:
            factor = volume / 100
            exponent = (sqrt(sqrt(sqrt(factor))) * 192 - 192) / 6
            numpy.multiply(data, pow(2, exponent), out=data, casting='unsafe')

        sounddevice.play(data, fs, device=device)
        sounddevice.wait()
def get_audio_list(path_dataset, bucket_name):
    """Download the dataset's audio files and decode each of them.

    Returns a list of ``(signal, sampling_rate)`` tuples, one per path
    returned by ``download_audio_files``, in the same order.
    """
    downloaded = download_audio_files(path_dataset, bucket_name)
    return [(samples, rate) for samples, rate in map(open_audio, downloaded)]
def ibm_recog(self, audioname, audiofp):
    """Send an mp3 to IBM Watson Speech to Text and load the audio.

    Opens ``audiofp`` relative to the directory of ``audioname``, streams it
    to the service over a websocket, gathers word timestamps and confidences
    from ``result``, then decodes the audio and hands everything to
    ``self.initAudio`` and ``self.setupIBM``.
    """
    # NOTE(review): the API key and service-instance URL are hard-coded
    # below; they look like live credentials and should probably come from
    # configuration — confirm and rotate if this file is public.
    authenticator = IAMAuthenticator(
        '6noBhxJHkbRVsgbxsl47v6dFZnJdoRRrDRYte7GgKKxu')
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(
        'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/51085e72-7959-4c18-94cd-d4d874baf61d'
    )
    myRecognizeCallback = MyRecognizeCallback()
    ts = []  # per-result timestamp lists
    c = []   # per-result confidence values
    with open(join(dirname(audioname), audiofp), 'rb') as audio_file:
        audio_source = AudioSource(audio_file)
        # The return value is bound to x but never read afterwards.
        x = speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/mp3',
            inactivity_timeout=-1,
            recognize_callback=myRecognizeCallback,
            model='en-US_BroadbandModel',
            timestamps=True,
            smart_formatting=True,
        )
    # NOTE(review): `result` is not assigned anywhere in this function;
    # presumably the callback fills a module-level name — verify.
    for r in result:
        alternatives = r.get('alternatives')
        ts.append(alternatives[0].get('timestamps'))
        # Flatten the list of per-result timestamp lists into one list.
        timestamps = [elem for twod in ts for elem in twod]
        c.append(alternatives[0].get('confidence'))
    # Mean confidence; raises ZeroDivisionError when nothing was collected.
    confidence = sum(c) / len(c)
    # NOTE(review): the file was opened relative to dirname(audioname) above
    # but is decoded from the bare `audiofp` here — confirm this is intended.
    a, sr = open_audio(audiofp)
    self.initAudio(a, sr)
    self.setupIBM(timestamps, confidence)
    self.audiofp = audiofp
def get_talking(time):
    '''Return the first `time` seconds of the bundled talking sample.

    The clip is read from ``test_talking.wav`` next to this module, cut to
    ``int(sampling_rate * time)`` samples and passed through
    ``scale_to_one``. (The previous docstring described the result as
    0-centered with a max amplitude of 0.5; that depends on
    ``scale_to_one`` and cannot be confirmed from this function.)
    '''
    import os

    # Build the path from the module's directory. Appending '/../name' to
    # __file__ only works where paths are normalised lexically; POSIX
    # refuses to traverse through a regular file.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    fp = os.path.join(module_dir, 'test_talking.wav')

    song_signal, sampling_rate = open_audio(fp)
    trunc_signal = song_signal[:int(sampling_rate * time)]
    return scale_to_one(trunc_signal)
def playSoundData(filename: str = "hello.mp3"):
    """Play `filename` twice at once: via playsound and via sounddevice.

    ``playsound`` runs in a background thread while the decoded samples are
    played through ``sd``; the function returns once both have finished.
    """
    background = threading.Thread(target=playsound, args=(filename,))
    background.start()

    samples, rate = open_audio(filename)
    sd.play(samples, rate)
    sd.wait()

    background.join()
def get_song(time):
    '''Return `time` seconds of the bundled song, passed through `flatten`.

    The clip is read from ``test_song_3.wav`` next to this module, starting
    at sample 1950000 and spanning ``int(sampling_rate * time)`` samples.
    '''
    import os

    # Build the path from the module's directory. Appending '/../name' to
    # __file__ only works where paths are normalised lexically; POSIX
    # refuses to traverse through a regular file.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    fp = os.path.join(module_dir, 'test_song_3.wav')

    song_signal, sampling_rate = open_audio(fp)
    start = 1950000  # fixed sample offset into the recording
    trunc_signal = song_signal[start:start + int(sampling_rate * time)]
    return flatten(trunc_signal)
def read_file(filename, directory):
    """Decode ``directory/filename`` and return ``(audio, sampling_rate)``.

    One-dimensional audio is returned unchanged. For multi-column audio,
    column 0 is added to every remaining column and the result is flattened
    row by row into a plain list.
    """
    samples, sampling_rate = open_audio(os.path.join(directory, filename))

    if samples.ndim > 1:
        combined = samples[:, 1:] + samples[:, :1]
        samples = []
        for row in combined:
            samples.extend(row)

    return samples, sampling_rate
def shortenPause(self, trans, pauseOverlap, RenderSettings):
    """Overlay two tracks, shorten long pauses and load the result.

    Args:
        trans: Indexable whose first two items expose ``audiofp`` (mp3 paths).
        pauseOverlap: Sequence of ``(start, end)`` pauses in seconds.
        RenderSettings: Object providing ``pauseShortenAmount`` in seconds,
            the longest pause that is kept.

    The shortened mix is exported to ``RawAudio/PauseShort.mp3``, decoded
    again and passed to ``self.initAudio``.

    NOTE(review): as in the original, the final pause in ``pauseOverlap`` is
    removed entirely rather than shortened — confirm this is intended.
    """
    max_pause_ms = RenderSettings.pauseShortenAmount * 1000

    first = AudioSegment.from_file(trans[0].audiofp, format='mp3')
    second = AudioSegment.from_file(trans[1].audiofp, format='mp3')
    mixed = first.overlay(second)

    pause_count = len(pauseOverlap)
    if pause_count == 0:
        # Nothing to shorten: keep the mix as it is (this used to fail with
        # a NameError on the tail slice).
        shortened = mixed
    else:
        shortened = AudioSegment.empty()

        # Audio before the first pause, if there is any.
        lead_in_end = 1000 * pauseOverlap[0][0]
        if lead_in_end != 0:
            shortened += mixed[:lead_in_end]

        for i in range(pause_count - 1):
            pause_start = 1000 * pauseOverlap[i][0]
            pause_end = 1000 * pauseOverlap[i][1]
            next_start = 1000 * pauseOverlap[i + 1][0]

            # Keep at most max_pause_ms of the pause. A pause exactly
            # max_pause_ms long used to match neither '>' nor '<' and was
            # dropped from the output.
            if pause_end - pause_start > max_pause_ms:
                shortened += mixed[pause_start:pause_start + max_pause_ms]
            else:
                shortened += mixed[pause_start:pause_end]

            # Speech between this pause and the next one.
            shortened += mixed[pause_end:next_start]

        # Everything after the last pause. Taken from the list itself so a
        # single-pause input no longer depends on a loop variable.
        shortened += mixed[1000 * pauseOverlap[-1][1]:]

    shortened.export('RawAudio/PauseShort.mp3', format='mp3')
    a, sr = open_audio('RawAudio/PauseShort.mp3')
    self.initAudio(a, sr)
def profanityFilter(self, trans, Renderettings):
    """Overlay two tracks and replace censored words with a bleep.

    Args:
        trans: Indexable whose first two items expose ``audiofp`` (mp3 paths).
        Renderettings: Unused; kept for interface compatibility.

    Words equal to ``'****'`` in ``self.words`` are replaced, using the
    matching entries of ``self.timestamps`` as ``(start, end)`` in seconds.
    The result is exported to ``RawAudio/cleantest.mp3``, decoded again and
    passed to ``self.initAudio``.
    """
    cens = 'RawAudio/timcensor.mp3'

    s1 = AudioSegment.from_file(trans[0].audiofp, format='mp3')
    s2 = AudioSegment.from_file(trans[1].audiofp, format='mp3')
    prof = s1.overlay(s2)
    bleep = AudioSegment.from_file(cens, format='mp3')

    # presumably fills self.words / self.timestamps — verify against its definition
    self.MainFromOthers(trans)
    badlist = [self.timestamps[i]
               for i, word in enumerate(self.words)
               if word == '****']

    if not badlist:
        # Nothing to censor: keep the mix unchanged (this used to raise
        # IndexError on badlist[0]).
        emp = prof
    else:
        emp = AudioSegment.empty()

        # Audio before the first censored word, then its bleep. As in the
        # original, the first bleep is not capped to 500 ms.
        first_start = badlist[0][0] * 1000
        first_len = (badlist[0][1] - badlist[0][0]) * 1000
        emp += prof[:first_start]
        emp += bleep[:first_len]

        for prev, cur in zip(badlist, badlist[1:]):
            # Clean audio between the previous and the current word.
            emp += prof[prev[1] * 1000:cur[0] * 1000]
            # Bleep for the current word, at most 500 ms long.
            emp += bleep[:min((cur[1] - cur[0]) * 1000, 500)]

        # Tail after the last censored word. Taken from badlist directly so
        # a single censored word no longer hits an unbound loop variable.
        emp += prof[badlist[-1][1] * 1000:]

    emp.export('RawAudio/cleantest.mp3', format='mp3')
    a, sr = open_audio('RawAudio/cleantest.mp3')
    self.initAudio(a, sr)
def __getitem__(self, idx):
    """Return ``(features, label)`` for the item at ``idx``.

    The file at ``self.data[idx]`` is decoded, averaged to one channel,
    resampled to 44100 Hz if needed, trimmed by 30 s at each end, and a
    random 30 s window is taken. When ``self.train`` is truthy the window
    is pitch-shifted by a random number of steps. The harmonic component is
    transformed with a constant-Q transform, of which the real part is kept,
    normalised and returned as a float tensor with a leading axis of size 1.
    The label comes from ``self.labels[idx]`` as a long tensor.

    NOTE(review): a signal of 90 s or less leaves no room for the random
    window and makes ``np.random.randint`` raise ValueError.
    """
    hop_length = 1024

    # open audio
    file_path = self.data[idx]
    signal, sampling_rate = open_audio(file_path)
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)
    if sampling_rate != 44100:
        # Keyword arguments: librosa >= 0.10 no longer accepts the sample
        # rates positionally, and older releases accept these names too.
        signal = librosa.resample(signal, orig_sr=sampling_rate, target_sr=44100)
        sampling_rate = 44100

    # number of samples in 30 seconds
    len_index_30_sec = int(30 / (1 / sampling_rate))
    # trim first and last 30 seconds
    signal = signal[len_index_30_sec:-len_index_30_sec]
    # random start index
    start_index = np.random.randint(low=0, high=len(signal) - len_index_30_sec)
    signal = signal[start_index:start_index + len_index_30_sec]

    # if training change pitch randomly (high is exclusive: -4 .. 3)
    if self.train:
        n_steps = np.random.randint(low=-4, high=4)
        signal = librosa.effects.pitch_shift(signal, sr=sampling_rate, n_steps=n_steps)

    # extract harmonic
    data_h = librosa.effects.harmonic(signal)
    # cqt transform; only the real part of the complex result is kept
    S = np.real(librosa.cqt(data_h, sr=sampling_rate, hop_length=hop_length)).astype(np.float32)
    d = torch.from_numpy(np.expand_dims(S, axis=0)).type(torch.FloatTensor)
    # normalize
    d = F.normalize(d)
    l = torch.from_numpy(np.array(self.labels[idx])).type(torch.LongTensor)
    return d, l
def resample_and_save_datasets(path_dataset, bucket_name, files_format,
                               dimension_start, folder_start, song_start,
                               fragment_start):
    """Resample every song at progressive sizes, save and upload fragments.

    For each ``(size, rate)`` pair from ``dimension_start`` onwards and each
    folder ``folder_start + 1 .. 9`` under
    ``local_ds/<files_format>/original/``, every song is decoded, split by
    ``resample_song`` and each fragment is written locally and uploaded
    under both an ``mp3`` and a ``wav`` path.

    The ``*_start`` arguments allow resuming an interrupted run: they apply
    only to the first dimension/folder processed and are reset afterwards.

    A song that fails is reported and skipped so one bad file does not stop
    the whole run.

    NOTE(review): the same ``write`` call produces both files; if ``write``
    is a WAV writer the ``.mp3`` files contain WAV data — confirm.
    """
    dimensiones_progresivas = [(4, 750), (8, 1500), (16, 3000), (32, 6000),
                               (64, 12000), (128, 24000), (256, 48000)]
    dimensiones_progresivas = dimensiones_progresivas[dimension_start:]

    for dimension in dimensiones_progresivas:
        size_tag = str(dimension[0]) + "-" + str(dimension[1])

        for folder in range(folder_start, 9):
            cant_fragmentos = fragment_start
            folder_tag = str(folder + 1)
            directory = "local_ds/" + files_format + "/original/" + folder_tag + "/"
            lista_canciones = os.listdir(directory)[song_start:]

            for song_dirname in lista_canciones:
                print("Preparando canción...: " + directory + song_dirname)
                try:
                    signal, sampling_rate = open_audio(directory + song_dirname)
                    list_resampled_songs = resample_song(
                        dimension, signal, sampling_rate)

                    for fragment in list_resampled_songs:
                        # save as mp3, then as wav, under parallel paths
                        for extension in ("mp3", "wav"):
                            relative = (extension + "/" + size_tag + "/" +
                                        folder_tag + "/" +
                                        str(cant_fragmentos) + "." + extension)
                            local_path = "local_ds/" + relative
                            path_upload = path_dataset + relative

                            folder_name = os.path.dirname(local_path)
                            if not os.path.exists(folder_name):
                                os.makedirs(folder_name)

                            write(local_path, dimension[1], fragment)
                            upload_blob(bucket_name, local_path, path_upload)
                        cant_fragmentos += 1
                except Exception as exc:
                    # Best effort: report and continue with the next song.
                    # Unlike the previous bare except, Ctrl-C still works.
                    print("Skipping " + directory + song_dirname + ": " + repr(exc))

            # reset for the next folder
            song_start = 0

        # reset for the next dimension
        folder_start = 0
        fragment_start = 1
def dalek_voice(inp):
    """Decode `inp`, run it through `mk_mid` and `ring_mod`, and play it.

    The ring modulator is called with a fixed third argument of 30; the
    resulting buffer is handed to ``sa.play_buffer`` with arguments 1 and 2
    and the file's sample rate.
    """
    samples, rate = open_audio(inp)
    modulated = ring_mod(mk_mid(samples, rate), rate, 30)
    sa.play_buffer(modulated, 1, 2, rate)
def load_mp3(fname: str) -> np.ndarray:
    """Load an mp3 file as a numpy array.

    Args:
        fname: Path of the file to decode.

    Returns:
        The sample data returned by ``open_audio``.

    Raises:
        AssertionError: If the file's sampling rate is neither ``FS`` nor
            ``ORIG`` (only when assertions are enabled).
    """
    data, sampling_rate = open_audio(fname)
    assert sampling_rate in (FS, ORIG), (
        "unexpected sampling rate %r for %r" % (sampling_rate, fname))
    return data
def test_aiff(self):
    """The AIFF example decodes to 128000 samples at 32000 Hz."""
    samples, rate = open_audio("./examples/chord.aif")
    n_samples = samples.shape[0]
    self.assertEqual(128000, n_samples)
    self.assertEqual(32000, rate)
def test_wav(self):
    """The WAV example decodes at 24000 Hz to 16128 samples."""
    samples, rate = open_audio("./examples/word.wav")
    self.assertEqual(24000, rate)
    n_samples = samples.shape[0]
    self.assertEqual(16128, n_samples)
def read(filename, normalized=False):
    """Decode `filename` and return ``(signal, sampling_rate)``.

    `normalized` is accepted but not used by this function.
    """
    samples, rate = open_audio(filename)
    return samples, rate
import math
import platform

# Terminal colour: prompts are wrapped in ANSI cyan on Linux only.
if platform.system() == "Linux":
    prefix, suffix = "\033[36m", "\033[39m"
else:
    prefix = suffix = ""

# Input audio file
fp = input(prefix + "Name of your input file: " + suffix)
print(prefix + "Opening audio file ..." + suffix)
signal, sampling_rate = open_audio(fp)

# Output file, upscale factor and logo image
output_file = input(prefix + "Name of output file: " + suffix)
upscale = int(
    input(prefix + "Upscale factor (2x recommended for HD, 4x for 4k): " + suffix)
)
logo_img = input(prefix + "Logo image: " + suffix)
print(prefix + "Opening logo file ..." + suffix)

print(prefix + "Sampling rate of audio file: " + suffix + str(sampling_rate))

# NOTE: `list` shadows the builtin; the name is kept because code further
# down the script refers to it.
list = []
c = 0
import noisereduce as nr


def plot_fft(f):
    """Plot a thinned power spectrum of the signal `f`.

    Uses the module-level ``sr`` as the sample rate, so it must only be
    called after the audio file below has been loaded. Every 10th bin of
    the lower half of the spectrum (bin 0 excluded) is drawn.
    """
    n = len(f)
    fhat = np.fft.fft(f, n)
    print('done')
    dt = 1 / sr
    PSD = fhat * np.conj(fhat) / n
    # Bin k of an n-point FFT lies at k / (n * dt) Hz. The axis used to be
    # computed as (1 / dt) * n * k, which mis-scaled it by a factor of n**2.
    freq = (1 / (dt * n)) * np.arange(n)
    L = np.arange(1, np.floor(n / 2), dtype='int')
    plt.plot(freq[L][::10], PSD[L][::10], alpha=0.1)


fp = r'C:\Users\Dell\PycharmProjects\audiobook\org.mp3'  # change to the correct path to your file accordingly
x, sr = open_audio(fp)
# sr, x = read_mp3(r'C:\Users\Dell\PycharmProjects\audiobook\org.mp3')
x_old = x.copy()
version = 4.0
# thr1 = 1e8
thr1 = 'na'
thr2 = int(278000 * 5)
thr3 = 200
# Sample-index ranges [start, end] that contain only noise.
# NOTE(review): [182e5, 2e5] has start > end and therefore selects nothing;
# it looks like a typo for 1.82e5 — confirm before changing.
noise_indexes = [[3e5, 3.4e5], [182e5, 2e5], [6.1e5, 6.4e5], [1.41e6, 1.43e6],
                 [1.78e6, 1.87e6], [2.02e6, 2.13e6], [2.7e6, 2.79e6],
                 [2.99e6, 3.08e6], [3.14e6, 3.23e6]]
x_noice = []
x_new = x[:, 0]  # first column of the decoded audio
for noise_tuple in noise_indexes:
    x_noice.extend(x_new[int(noise_tuple[0]):int(noise_tuple[1])])
import os LENGTH_TO_CONSTRUCT = 500000000 SPEECH_PATH = "en/clips" #Common Voice dataset, https://commonvoice.mozilla.org/en/datasets NOISE_PATH = "UrbanSound8k/audio/fold" #UrbanSound8k dataset, https://urbansounddataset.weebly.com/urbansound8k.html # print(len(os.listdir(SPEECH_PATH))) # print(len(os.listdir(NOISE_PATH))) length = 0 sounds_as_tensors = [] for file_name in os.listdir(SPEECH_PATH): if ".mp3" in file_name: data, rate = open_audio(SPEECH_PATH + "/" + file_name) data = librosa.resample(data, 48000, 22050) length += len(data) sound_as_tensor = torch.tensor(data) sounds_as_tensors.append(sound_as_tensor) if length >= LENGTH_TO_CONSTRUCT: break speech_tensor = torch.cat(sounds_as_tensors)[:LENGTH_TO_CONSTRUCT] torch.save(speech_tensor, "SPEECH.pt") print(speech_tensor.size()) length = 0 sounds_as_tensors = [] for fold in range(1, 11):
def speech_file_to_array_fn(batch):
    """Decode ``batch["path"]`` and store the resampled audio in ``batch["speech"]``.

    The decoded samples are converted to a tensor, passed to ``resampler``
    together with the file's sampling rate, and squeezed. The same `batch`
    object is returned.
    """
    waveform, source_rate = an.open_audio(batch["path"])
    as_tensor = torch.tensor(waveform)
    resampled = resampler(source_rate, as_tensor)
    batch["speech"] = resampled.squeeze()
    return batch
#!/usr/bin/env python
# coding: utf-8

# In[2]:

from audio2numpy import open_audio
import numpy as np
import matplotlib.pyplot as plt

# In[3]:

signal, signal_rate = open_audio("lisergic.mp3")

# In[137]:

# Work on the first column of the decoded audio only.
orig = signal[:, 0]
L = len(orig)
# Square root of the signal's zero-lag autocorrelation.
orig_rms = np.sqrt(np.correlate(orig, orig))
noisies = []
for i in range(3):
    # Circularly shift the signal left by up to 1% of its length ...
    rnd = int(np.random.uniform() * L * 0.01)
    print(rnd)
    noisy = np.zeros(L)
    noisy[:L - rnd] = orig[rnd:]
    noisy[L - rnd:] = orig[:rnd]
    # ... then add uniform noise scaled by 1 / orig_rms.
    noisy += (1 / orig_rms) * np.random.uniform(size=L)
title, artist = ("-".join(tokens[:-1]), tokens[-1].strip()) if len(tokens) > 1 else (title, None) lines.append((t, title.strip().replace("/", "-"), artist)) return lines if __name__ == "__main__": parser = argparse.ArgumentParser(description="ostlyser") parser.add_argument("-v", "--version", action="version", version="1.0.0") parser.add_argument("-a", "--audio", dest="audio", required=True, help="audio file to break up") parser.add_argument("-i", "--input", dest="file", required=True, help="read timing information from a file") parser.add_argument("-d", "--delimiter", dest="delim", default=":", help="delimiter to split input file on") args = parser.parse_args() data, sr = open_audio(args.audio) file = eyed3.load(args.audio) lines = parse_file(args, sr) for i, (start, name, artist) in enumerate(lines): # get slice of the original file that this song represents song = data[start: lines[i + 1][0] if i != len(lines) - 1 else len(data)] path = f"{i + 1}_{name}" save_mp3(path, song, sr) f = eyed3.load(path + ".mp3") # copy the tags of the original file f.tag = file.tag f.tag.title = name f.tag.artist = artist f.tag.track_num = i + 1
plt.plot(sample23, x23, ':b*')
plt.title('Rational 2/3')
plt.tight_layout()
plt.show()

# Sine wave manipulation: load the audio file
import ffmpeg
from audio2numpy import open_audio

fp = 'sin.wav'
audio, sampling_rate = open_audio(fp)
sample_space = len(audio)

# Plot the magnitude of the Fourier transform of the original signal
plt.figure()
plt.style.use('seaborn')
fourier_audio = np.abs(np.fft.fft(audio))
omega_orginal = np.linspace(-2 * np.pi, 2 * np.pi, sample_space)
plt.subplot(2, 2, (1, 2))
plt.plot(omega_orginal, fourier_audio)
plt.title('Original')
#Down->Inter