def on_epoch_end(self, epoch, logs=None):
    # logs=None avoids a mutable default argument; Keras passes logs explicitly.
    for _ in range(self.num_tests):
        x, y = self.random_sample()
        y_p = self.model.predict(x.reshape((1, x.shape[0], 1)))
        x = np.squeeze(x)
        y = np.squeeze(y)
        y_p = np.squeeze(y_p)
        if self.difference_mask:
            y = x + y
            y_p = x + y_p
        print('x/y_p diff:')
        print(abs(np.sum(x) - np.sum(y_p)))
        print('x vs predicted y')
        plt.plot(x, color='red')
        plt.plot(y_p)
        plt.show()
        print('ground truth y vs predicted y')
        plt.plot(y, color='red')
        plt.plot(y_p)
        plt.show()
        if self.audio_preview:
            print('input sample:')
            ipd.display(ipd.Audio(x, rate=self.sr, autoplay=False))
            print('ground truth:')
            ipd.display(ipd.Audio(y, rate=self.sr, autoplay=False))
            print('prediction:')
            ipd.display(ipd.Audio(y_p, rate=self.sr, autoplay=False))
def show_data(df, row):
    # Retrieve information from the DataFrame
    audio_data, sampling_rate, label = get_data_sample_rate_and_legend_from_df(df, row)

    # Print some stats and display the sound
    print(f"{label} ({librosa.get_duration(y=audio_data, sr=sampling_rate)} sec)")
    ipd.display(ipd.Audio(audio_data, rate=sampling_rate))
    print("\n")

    # Make plots
    X = librosa.stft(audio_data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(8, 16), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(3, 1, 1)
    plt.title("Wave")
    librosa.display.waveplot(audio_data, sr=sampling_rate, x_axis="time")
    plt.subplot(3, 1, 2)
    plt.title("Mel")
    librosa.display.specshow(Xdb, sr=sampling_rate, x_axis="time", y_axis="mel")
    plt.subplot(3, 1, 3)
    plt.title("Hz")
    librosa.display.specshow(Xdb, sr=sampling_rate, x_axis="time", y_axis="hz")
    plt.show()

    print("Audio")
    ipd.display(ipd.Audio(audio_data, rate=sampling_rate))
def style_transfer_v2():
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)

    # Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))
    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # Get audio path, encoded text, pitch contour and mel for GST
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # Load source data to obtain rhythm, using Tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.display(ipd.Audio(audio_path, rate=hparams.sampling_rate))

    # Style transfer (rhythm and pitch contour)
    with torch.no_grad():
        # Get rhythm (alignment map) using Tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
        rhythm = rhythm.permute(1, 0, 2)

    speaker_id = next(female_speakers) if np.random.randint(2) else next(male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

    # Vocode with MelGAN
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

    # Vocode with WaveGlow + denoiser
    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
    ipd.display(ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate))
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
def listen(signals, sr=None):
    if isinstance(signals, str):
        ipd.display(ipd.Audio(signals))
    elif isinstance(signals, np.ndarray):
        ipd.display(ipd.Audio(signals, rate=sr))
    elif isinstance(signals, list):
        for signal in signals:
            ipd.display(ipd.Audio(signal, rate=sr))
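# Illustrative use of listen() covering its three accepted input types; the
# tone and file name below are made up for demonstration, not from the source.
import numpy as np
sr = 22050
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of 440 Hz
listen(tone, sr=sr)                # single array
listen([tone, tone[::-1]], sr=sr)  # list of arrays
listen('example.wav')              # path to a file on disk (hypothetical)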
def play(self, with_clicks=False):
    if not with_clicks:
        return ipd.Audio(self.path, rate=self.sampling_rate)
    clicks = librosa.clicks(self.beat_times, sr=self.sampling_rate,
                            length=len(self.waveform))
    audio = self.waveform + clicks
    return ipd.Audio(audio, rate=self.sampling_rate)
def audio(S, hop_length=HOP_LENGTH, sr=SR):
    if len(S.shape) > 1:
        y = signal(S, hop_length)
        if y.size > 0:
            return ipd.display(ipd.Audio(y, rate=sr))
        return ipd.display(ipd.Audio(np.zeros(hop_length * 2), rate=sr))
    return ipd.display(ipd.Audio(S, rate=sr))
from typing import List

def resynthesize_sources_params(W, H, midi, signal,
                                source_activations: List[numpy.ndarray]) -> None:
    for source_index, source in enumerate(source_activations):
        channel_H = source * H
        Y = numpy.dot(W, channel_H) * signal.X_phase
        print(f'Channel {source_index}:')
        reconstructed_signal = librosa.istft(Y, length=len(signal.x))
        ipd.display(ipd.Audio(reconstructed_signal, rate=signal.sr))

        # Soft (Wiener-style) mask: this source's share of the full model W @ H
        mask = numpy.dot(W, channel_H) / (numpy.dot(W, H) + numpy.finfo(float).eps)
        Y2 = mask * signal.S
        print(f'Channel {source_index} (masked):')
        reconstructed_signal2 = librosa.istft(Y2, length=len(signal.x))
        ipd.display(ipd.Audio(reconstructed_signal2, rate=signal.sr))
def visualize_audio_data(data_x, data_y, sr=44100):
    for x, y in zip(data_x, data_y):
        print('x data:')
        plt.plot(x)
        print('y data:')
        plt.plot(y, color='red')
        plt.show()
        print('x data:')
        ipd.display(ipd.Audio(x, rate=sr, autoplay=False))
        print('y data:')
        ipd.display(ipd.Audio(y, rate=sr, autoplay=False))
        print('\n')
def resynthesize_sources_midi(W, H, midi, signal, tol_on, tol_off):
    channel_activations = initialize_activations(signal, midi, get_pitches(midi),
                                                 tol_on, tol_off, by_channel=True)
    for channel in sorted(channel_activations.keys()):
        channel_H = channel_activations[channel] * H
        Y = numpy.dot(W, channel_H) * signal.X_phase
        print(f'Channel {channel}:')
        reconstructed_signal = librosa.istft(Y, length=len(signal.x))
        ipd.display(ipd.Audio(reconstructed_signal, rate=signal.sr))

        # Soft mask, as in resynthesize_sources_params above
        mask = numpy.dot(W, channel_H) / (numpy.dot(W, H) + numpy.finfo(float).eps)
        Y2 = mask * signal.S
        print(f'Channel {channel} (masked):')
        reconstructed_signal2 = librosa.istft(Y2, length=len(signal.x))
        ipd.display(ipd.Audio(reconstructed_signal2, rate=signal.sr))
def nmf_display(W, H, signal, components):
    display_components(components)

    # Re-create the STFT from all NMF components.
    Y = numpy.dot(W, H) * signal.X_phase

    # Transform the STFT into the time domain.
    print('Reconstructed')
    reconstructed_signal = librosa.istft(Y, length=len(signal.x))
    ipd.display(ipd.Audio(reconstructed_signal, rate=signal.sr))

    print('Residual')
    residual = signal.x - reconstructed_signal
    residual[0] = 1  # hack to prevent automatic gain scaling
    ipd.display(ipd.Audio(residual, rate=signal.sr))
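# The NMF helpers above take W and H as given. A minimal sketch of one way to
# produce them with librosa, assuming a `signal` object with the .S STFT field
# used above; n_components=8 is an illustrative choice, not from the source.
import numpy
import librosa

def nmf_factorize(signal, n_components=8):
    # Factor the magnitude spectrogram: |S| ~= W @ H, where W holds spectral
    # templates (freq x components) and H their per-frame activations.
    W, H = librosa.decompose.decompose(numpy.abs(signal.S),
                                       n_components=n_components, sort=True)
    return W, H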
def compare_inverse(x, x_hat, res=None):
    wav = mels_to_wav(x.unsqueeze(0))
    wav_hat = mels_to_wav(x_hat.unsqueeze(0))
    plt.figure(figsize=[8, 8])
    if res is not None:
        plt.subplot(2, 1, 1)
        plt.plot(res)
        plt.grid()
    plt.subplot(2, 2, 3)
    plt.imshow(x.detach().cpu(), origin='lower', cmap='magma')
    plt.subplot(2, 2, 4)
    plt.imshow(x_hat.detach().cpu(), origin='lower', cmap='magma')
    plt.show()
    ipd.display(ipd.Audio(wav, rate=16000))
    ipd.display(ipd.Audio(wav_hat, rate=16000))
def exercise_beating(show_result=True):
    """Exercise 1: Beating

    Notebook: PCP_signal.ipynb
    """
    if show_result is False:
        return

    Fs = 100
    dur = 5
    omega_1 = 10
    omega_2 = 11
    x1, t = generate_sinusoid(dur=dur, Fs=Fs, amp=0.5, freq=omega_1)
    x2, t = generate_sinusoid(dur=dur, Fs=Fs, amp=0.5, freq=omega_2)
    # |omega_2 - omega_1| is the beating frequency (its inverse is the period)
    title = r'Beating with $\omega_1=%.1f$ and $\omega_2=%.1f$ (beating frequency: %.1f)' % \
        (omega_1, omega_2, np.abs(omega_2 - omega_1))
    plot_interference(t, x1, x2, ylim=[-1.1, 1.1], xlim=[0, dur], title=title)
    plot_interference(t, x1, x2, ylim=[-1.1, 1.1], xlim=[1, 2], title=r'Zoom-in section')

    Fs = 4000
    dur = 5
    omega_1 = 200
    omega_2 = 203
    x1, t = generate_sinusoid(dur=dur, Fs=Fs, amp=0.5, freq=omega_1)
    x2, t = generate_sinusoid(dur=dur, Fs=Fs, amp=0.5, freq=omega_2)
    title = r'Beating with $\omega_1=%.1f$ and $\omega_2=%.1f$ (beating frequency: %.1f)' \
        % (omega_1, omega_2, np.abs(omega_2 - omega_1))
    plot_interference(t, x1, x2, ylim=[-1.1, 1.1], xlim=[0, dur], title=title)
    ipd.display(ipd.Audio(x1 + x2, rate=Fs))
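# generate_sinusoid and plot_interference come from the PCP notebook utilities,
# which are not included here. A minimal stand-in for the former, consistent
# with how it is called above (returns the signal and its time axis):
import numpy as np

def generate_sinusoid(dur=1.0, Fs=100, amp=1.0, freq=1.0, phase=0.0):
    t = np.arange(0, dur, 1 / Fs)
    x = amp * np.sin(2 * np.pi * (freq * t + phase))
    return x, t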
def generate_images(images, source='fake', save=True):
    # Make sure the training parameter is set to False because we
    # don't want to train the batchnorm layer when doing inference.
    if source == 'fake':
        disp_images = images['fake_images']
    elif source == 'real':
        disp_images = images['real_images']
    else:
        raise ValueError

    plt.title(source.capitalize() + " log-magnitudes")
    for i in range(16):
        plt.subplot(4, 4, i + 1)
        plt.imshow(np.transpose(disp_images[i, :, :, 0]) * 127.5,
                   cmap="magma", origin="lower", aspect="auto")
        plt.axis('off')
    if save:
        plt.savefig('images/image_at_{}_{}.png'.format(images['global_step'][0, 0], source))
    plt.show()

    plt.title(source.capitalize() + " instantaneous frequencies")
    for i in range(16):
        plt.subplot(4, 4, i + 1)
        plt.imshow(np.transpose(disp_images[i, :, :, 1]) * 127.5,
                   cmap="magma", origin="lower", aspect="auto")
        plt.axis('off')
    plt.show()

    audio = data_helper.melspecgrams_to_waves(disp_images)[:, :, 0].eval(
        session=tf.Session()) * 100000
    audio = audio.astype(np.float32)
    for i in range(0, 4):
        display.display(display.Audio(audio[i, :], rate=16000))
    return images['global_step'][0, 0]
def play(a: np.ndarray, rate: int = 44100, volume: float = 0.2,
         repeat: int = 1, autoplay=True):
    # Tile the clip `repeat` times and scale by `volume`. Note that
    # IPython's Audio normalizes playback unless normalize=False is passed.
    wave = np.tile(a, repeat) * volume
    return ipd.Audio(wave, rate=rate, autoplay=autoplay)
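# Illustrative call to play() above: repeat a short 440 Hz beep three times.
# The tone and duration are made-up example values.
import numpy as np
sr = 44100
beep = np.sin(2 * np.pi * 440 * np.arange(int(0.25 * sr)) / sr)
play(beep, rate=sr, repeat=3)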
def doStuff():
    plt.figure(figsize=(20, 80))
    offset = 40
    duration = 1
    x, sr = librosa.load(r'C:\Dev\tools\WavFiles\Kool & The Gang - Get Down On It.wav',
                         offset=offset, duration=duration)
    print(x.shape)
    ipd.display(ipd.Audio(x, rate=sr))

    hop_length = 128
    n_fft = 4096
    D = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    plt.subplot(3, 1, 1)
    librosa.display.specshow(librosa.amplitude_to_db(librosa.magphase(D)[0], ref=numpy.max),
                             y_axis='log', x_axis='time')

    pitches, magnitudes = librosa.core.piptrack(y=x, sr=sr, S=D, threshold=0.1)
    plt.subplot(3, 1, 2)
    librosa.display.specshow(pitches, y_axis='linear', x_axis='time')
    print(pitches.shape)
    plt.subplot(3, 1, 3)
    librosa.display.specshow(magnitudes, y_axis='linear', x_axis='time')

    average_magnitudes = numpy.average(magnitudes, 1)
    max_avg_mag = int(max(average_magnitudes))  # numpy.int is removed in NumPy >= 1.24
    step = 0.01
    bins = numpy.arange(0, 0.5, step)
    hist, bin_edges = numpy.histogram(average_magnitudes, bins)
    plt.clf()
    plt.bar(bin_edges[:-1], hist, width=step)
    plt.xlim(min(bin_edges), max(bin_edges))

    # enumerate() is needed to get (index, value) pairs from the 1-D array
    mag_thresh = [index for index, val in enumerate(average_magnitudes) if val > 0.5]
    print(len(mag_thresh))
    plt.draw()
    plt.show()

    m = pairwise_distances(magnitudes, metric=dtw_metric)
def show_wav(file_name):
    # file_name = path_train + '0\\00.wav'  # hard-coded debug override, disabled
    plt.figure(figsize=(12, 4))
    data, sample_rate = librosa.load(file_name)
    _ = librosa.display.waveplot(data, sr=sample_rate)
    ipd.display(ipd.Audio(file_name))
    plt.show()
def plot_graphs_for_freq(freq):
    x = np.sin(2 * np.pi * freq * n / Fs)
    plt.subplot(221)
    plt.xlabel('Samples')
    plt.ylabel(rf'$\sin(2 \pi \cdot {freq} \cdot n/8192)$')
    plt.title('First 50 Samples of $x[n]$')
    plt.stem(n[:50], x[:50])
    plt.subplot(223)
    plt.xlabel('Time ($s$)')
    plt.ylabel(rf'$\sin(2 \pi \cdot {freq} \cdot n/8192)$')
    plt.title('First 50 Samples of $x[n]$')
    plt.plot(t[:50], x[:50])

    X, w = ctfts(x, 1 / Fs)
    plt.subplot(222)
    plt.title('Magnitude of $X$ vs. $f$')
    plt.ylabel(r'$|H(j\Omega)|$')
    plt.xlabel('Frequency ($Hz$)')
    plt.plot(w, np.abs(X))
    plt.subplot(224)
    plt.title('Phase of $X$ vs. $f$')
    plt.ylabel(r'$\angle H(j\Omega)$')
    plt.xlabel('Frequency ($Hz$)')
    plt.plot(w, np.angle(filter_small_values(X), deg=True))
    plt.tight_layout()
    plt.show()

    print(f'{freq} Hz Tone')
    ipd.display(ipd.Audio(x, rate=Fs))
    print()
def fun1(audio_path):
    x, sr = librosa.load(audio_path)
    ipd.display(ipd.Audio(x, rate=sr))
    hop_length = 512 * 8
    chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
    a = len(chromagram)
    b = len(chromagram[0])
    # Build independent rows; [[0] * b] * a would alias the same row a times.
    result = [[0] * b for _ in range(a)]
    # ret = get_wav_time(audio_path)
    # print(ret)
    cnt = 0
    for i in range(b):
        max_val = 0  # renamed from `max` to avoid shadowing the builtin
        resul = ''
        for j in range(a):
            result[j][i] = re_map[j]
            if max_val < chromagram[j][i]:
                max_val = chromagram[j][i]
                resul = result[j][i]
        cnt = cnt + 1
        if cnt > 17:
            print(resul)  # note label of the strongest chroma bin
            cnt = 0
    return resul
def fun1(audio_path):
    x, sr = librosa.load(audio_path)
    ipd.display(ipd.Audio(x, rate=sr))
    hop_length = 512 * 8
    chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
    a2 = len(chromagram)
    b2 = len(chromagram[0])
    print(a2, b2)
    # Build independent rows; [[0] * b2] * a2 would alias the same row.
    result = [[0] * b2 for _ in range(a2)]
    re = [0 for _ in range(60)]
    # ret = get_wav_time(audio_path)
    # print(ret)
    cnt = 0
    m = 0
    for i in range(b2):  # the original iterated over an undefined name `b`
        max_val = 0
        best = None
        for j in range(a2):
            result[j][i] = re_map[j]
            if max_val < chromagram[j][i]:
                max_val = chromagram[j][i]
                best = result[j][i]
        cnt = cnt + 1
        if cnt > 10 and m < len(re):
            print(j, i, re[m])
            # The original assigned an undefined `re_max`; assume the label
            # of the strongest chroma bin was intended.
            re[m] = best
            cnt = 0
            m = m + 1
    return re
def add_secondary(clips):
    # INITIALIZE AUDIO AND GET DURATION
    y, sr = librosa.load(SONG_PATH)
    ipd.display(ipd.Audio(y, rate=sr))
    song_length = librosa.core.get_duration(y=y, sr=sr)

    # GET TOTAL TIME LINE DURATION
    total_clip_duration = 0
    for clip in clips:
        total_clip_duration += clip.MPObj.duration

    # FIND DIFFERENCE BETWEEN TIME LINE AND SONG DURATION
    difference = song_length - total_clip_duration

    # PICK CLIPS LESS THAN DIFF
    selected_clips = []
    secondary_bin = [f for f in listdir(SECONDARY_RELATIVE_PATH) if not f.startswith('.')]
    time_left = difference
    while time_left > 14:  # THRESHOLD CHOSEN ARBITRARILY-ISH
        selected_clip = random.choice(secondary_bin)
        # The clip's duration (one digit) is encoded in the filename just
        # before the extension.
        clip_seconds = int(selected_clip[-5:-4])
        if clip_seconds < time_left:
            mp_object = VideoFileClip("./secondary/" + selected_clip)
            selected_clips.append(Clip(mp_object))
            time_left = time_left - clip_seconds

    # ADD CLIPS RANDOMLY INTO ARRAY
    clips_plus_secondary = clips
    for x in selected_clips:
        clips_plus_secondary.insert(randint(0, len(clips_plus_secondary)), x)
    return clips_plus_secondary
def remove_noise_function(file_name):
    # Read audio
    audio = f'{file_name}.wav'
    path = os.fspath(audio)
    data, sr = librosa.load(path=path, duration=5.0)

    # Remove noise: select a section of the data that is noise
    noise_len = 2  # seconds
    noise = band_limited_noise(min_freq=4000, max_freq=12000,
                               samples=len(data), samplerate=sr) * 10
    noise_clip = noise[:sr * noise_len]

    # Perform noise reduction
    reduced_noise = nr.reduce_noise(audio_clip=data, noise_clip=noise_clip, verbose=True)

    # Display audio
    print('after remove')
    ipd.display(ipd.Audio(reduced_noise, rate=sr))
    librosa.output.write_wav(f'{file_name}.wav', reduced_noise, sr)

    # Change format from wav to flac
    wav_audio = AudioSegment.from_file(f"{file_name}.wav", format="wav")
    wav_audio.export(f"{file_name}.flac", format="flac")
def _make_audio_grid(ds, key, samplerate, rows, cols, plot_scale):
    """Plot the waveforms and IPython objects of some samples of the argument
    audio dataset.

    Args:
      ds: `tf.data.Dataset`. The tf.data.Dataset object to visualize.
      key: The inferred key for the dataset.
      samplerate: Inferred samplerate of the dataset.
      rows: `int`, number of rows of the display grid.
      cols: `int`, number of columns of the display grid.
      plot_scale: `float`, controls the plot size of the images. Keep this
        value around 3 to get a good plot. High and low values may cause the
        labels to get overlapped.

    Returns:
      fig: Waveform figure to display. IPython objects are not returned.
    """
    import IPython.display as ipd
    plt = lazy_imports_lib.lazy_imports.matplotlib.pyplot

    num_examples = rows * cols
    examples = list(dataset_utils.as_numpy(ds.take(num_examples)))

    fig = plt.figure(figsize=(plot_scale * cols, plot_scale * rows))
    fig.subplots_adjust(hspace=1 / plot_scale, wspace=1 / plot_scale)

    t1 = 0
    t2 = 100 * 1000
    for i, ex in enumerate(examples):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.plot(ex[key])
        audio = ex['audio']
        newaudio = audio[t1:t2]
        ipd.display(ipd.Audio(newaudio, rate=samplerate))
    plt.show()
    return fig
def transcribe(signal, model, norm=1, chroma=1, log=1):
    if isinstance(signal, util.piece):
        signal.downsample(16000)
        signal = signal.to_chunk(20)
    # plt.figure()
    # assert not (pXs[0] - Xs[0]).any()
    # pXs = Xs[:500]
    # pYs_exp = Ys[:500]
    pYs_act = model.predict_on_batch(signal)
    # pYs_act = pYs_act * np.arange(pYs_act.shape[1])[None, :]
    pYs_act += 1E-10  # avoid zeros, which would break the LogNorm below
    # compare(log=1)
    if chroma:
        # pYs_exp = mroll2chroma(pYs_exp)
        pYs_act = mroll2chroma(pYs_act, norm=1)
    Z1 = pYs_act.T
    if log:
        plt.pcolormesh(Z1, alpha=1.0, norm=mpl.colors.LogNorm(vmin=Z1.min(), vmax=Z1.max()))
    else:
        plt.pcolormesh(Z1)
    fs = np.arange(pYs_act.shape[-1])[None, :]
    pYs_act = pYs_act * fs
    ipd.display(ipd.Audio(midi_roll_play(pYs_act), rate=16001.))
    return pYs_act
def plot(file_name):
    plt.figure(figsize=(12, 4))
    data, sample_rate = librosa.load(file_name)
    _ = librosa.display.waveplot(data, sr=sample_rate)
    ipd.display(ipd.Audio(file_name))
def recordAudio(self):
    RATE = 16000
    RECORD_SECONDS = 2.5
    CHUNKSIZE = 1024

    # Initialize portaudio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                    input=True, frames_per_buffer=CHUNKSIZE)
    print("*** Recording ***")
    frames = []
    for _ in range(0, int(RATE / CHUNKSIZE * RECORD_SECONDS)):
        data = stream.read(CHUNKSIZE)
        # np.fromstring is deprecated for binary data; frombuffer is the
        # supported equivalent.
        frames.append(np.frombuffer(data, dtype=np.int16))

    # Convert the list of numpy arrays into a 1D array (column-wise)
    numpydata = np.hstack(frames)
    print("* done")

    # Close the stream
    stream.stop_stream()
    stream.close()
    p.terminate()

    ipd.display(ipd.Audio(numpydata, rate=RATE))
    out_dir = "testingData"
    filename = "output.wav"
    wav.write(os.path.join(out_dir, filename), RATE, numpydata)
def play_sequence(sequence,
                  synth=midi_synth.synthesize,
                  sample_rate=_DEFAULT_SAMPLE_RATE,
                  colab_ephemeral=True,
                  **synth_args):
    """Creates an interactive player for a synthesized note sequence.

    This function should only be called from a Jupyter or Colab notebook.

    Args:
      sequence: A music_pb2.NoteSequence to synthesize and play.
      synth: A synthesis function that takes a sequence and sample rate as input.
      sample_rate: The sample rate at which to synthesize.
      colab_ephemeral: If set to True, the widget will be ephemeral in Colab,
        and disappear on reload (and it won't be counted against realtime
        document size).
      **synth_args: Additional keyword arguments to pass to the synth function.
    """
    array_of_floats = synth(sequence, sample_rate=sample_rate, **synth_args)
    try:
        import google.colab  # pylint: disable=unused-import,unused-variable,g-import-not-at-top
        colab_play(array_of_floats, sample_rate, colab_ephemeral)
    except ImportError:
        display.display(display.Audio(array_of_floats, rate=sample_rate))
def alert(self, n):
    if n >= self.total - 1:
        # Keep playing the last chunk if more sounds are needed
        n = self.total - 1
    data_chunk = self.wav[n * self.chunk_size:(n + 1) * self.chunk_size]
    self.display.update(disp.Audio(data_chunk * self.envelope,  # * self.volume
                                   rate=self.sample_rate, autoplay=True))
def solve(x_val, y_val, classes, model):
    model = load_model('best_model.hdf5')  # overrides the passed-in model
    index = random.randint(0, len(x_val) - 1)
    samples = x_val[index].ravel()
    print("Audio:", classes[np.argmax(y_val[index])])
    ipd.display(ipd.Audio(samples, rate=8000))
    print("Prediction:", predict(samples, model, classes))
def plot_fft_and_listen(filepath, raw_axis=False):
    sr = 22050
    x = load_wav(filepath)
    x_ft = np.abs(np.fft.fft(x))
    time = np.arange(len(x), dtype=float) / sr
    freq = np.arange(len(x_ft), dtype=float) / len(x_ft) * sr
    if raw_axis:
        print('sample rate:', sr)
        print('N:', len(x))

    plt.figure()
    plt.subplot(2, 1, 1)
    if raw_axis:
        plt.plot(x)
        plt.xlabel('n')
        plt.ylabel('$x(n)$')
    else:
        plt.plot(time, x)
        plt.xlabel('time')

    plt.subplot(2, 1, 2)
    if raw_axis:
        plt.plot(x_ft)
        plt.xlabel('k')
        plt.ylabel('$|X(k)|$')
        plt.xlim(0, 3000 * len(x) / sr)
    else:
        plt.plot(freq, x_ft)
        plt.xlim(0, 3000)
        plt.xlabel('Frequency (Hz)')

    return ipd.Audio(x, rate=sr)
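# `load_wav` is not defined in the snippet above; a minimal stand-in using
# librosa, pinned to the 22050 Hz rate the function assumes:
import librosa

def load_wav(filepath, sr=22050):
    x, _ = librosa.load(filepath, sr=sr)
    return x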
def generate_audio(mel, waveglow, filepath, sample_rate=22050):
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.666)
    audio = audio[0].data.cpu().numpy()
    audio = ipd.Audio(audio, rate=sample_rate)
    with open(filepath, "wb") as f:
        f.write(audio.data)