def compress_translated_sentence(self, translation, max_allowed_duration, locale, client):
    audio_bytes = translation.raw_audio
    samplerate = translation.samplerate
    # duration = frames / samplerate
    translation.compression_ratio = float(
        audio_bytes.shape[0] / samplerate) / float(max_allowed_duration)
    if client and client.gcloud_speedup and translation.compression_ratio > 1.0:
        audio_bytes, samplerate = sf.read(
            io.BytesIO(
                client.get_audio_chunk_for_sentence(
                    translation.text, locale,
                    speedup=translation.compression_ratio)),
            always_2d=True,
        )
        # We might be slightly off on timing here; gcloud isn't perfect.
        # Recalculate the compression ratio.
        translation.compression_ratio = float(
            audio_bytes.shape[0] / samplerate) / float(max_allowed_duration)
    if translation.compression_ratio > 1.0:
        audio_bytes = pyrubberband.time_stretch(
            audio_bytes, samplerate, translation.compression_ratio)
    translation.audio = audio_bytes
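# A minimal check of the rate semantics the snippet above relies on (the
# values here are made up): pyrubberband.time_stretch(y, sr, rate) with
# rate > 1.0 *shortens* the audio, which is why a compression_ratio above
# 1.0 can be passed directly as the stretch rate to fit the clip into
# max_allowed_duration.
import numpy as np
import pyrubberband

y = np.zeros(44100)                            # 1 second of silence at 44.1 kHz
y2 = pyrubberband.time_stretch(y, 44100, 2.0)
print(y2.shape[0])                             # roughly 22050 samples (0.5 s)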
def stretch_seg(self, frameCount):
    # Tunable parameter: decrease or increase the multiplier when the frame
    # count after adjustment does not equal frameCount.
    approach_para = 0.00001
    # difference_upbound = 10
    #   If the frame-count difference between the two signals exceeds
    #   difference_upbound, the multiplier would be adjusted.
    # approach_para_mul = 1
    #   Increases the speed of the approach.
    ori = self.signal
    multiplier = float(self.chroma.shape[1]) / frameCount
    count = 1
    # Keep approaching while the frame count differs from the target
    # (this should rarely be needed in practice).
    while self.chroma.shape[1] != frameCount:
        self.signal = pyrb.time_stretch(ori, self.sr, multiplier)
        self.chroma = librosa.feature.chroma_stft(self.signal, sr=self.sr)
        difference = self.chroma.shape[1] - frameCount
        if difference < 0:
            # The adjusted signal is too short.
            multiplier -= approach_para
        elif difference > 0:
            # The adjusted signal is too long.
            multiplier += approach_para
        count += 1
    self.chroma = librosa.feature.chroma_stft(self.signal, sr=self.sr)
    self.tempo = librosa.beat.tempo(self.signal, sr=self.sr)
    self.spec = librosa.feature.melspectrogram(self.signal, sr=self.sr)
    print('adjusted:', self.name, 'after', count - 1, 'approach iterations,',
          'with multiplier =', multiplier)
def syncBlocks(path, CSM, beats1, beats2, Fs, hopSize, XAudio1, XAudio2, BeatsPerBlock, fileprefix=""):
    """
    :param path: Px2 array representing a partial warping path to align two songs
    :param CSM: The cross similarity matrix between two songs
    :param beats1: An array of beat onsets for song 1 in increments of hopSize
    :param beats2: An array of beat onsets for song 2 in increments of hopSize
    :param Fs: Sample rate of both songs
    :param hopSize: Hop size in samples used to express the beat onsets
    :param XAudio1: The raw audio samples for song 1
    :param XAudio2: The raw audio samples for song 2
    :param BeatsPerBlock: The number of beats per block for each pixel in the CSM
    :param fileprefix: Prefix of each stretched block to save.  By default, blank,\
        so no debugging info saved
    :returns (XFinal: An NSamples x 2 array with the first song along the first column\
                and the second synchronized song along the second column,\
              beatsFinal: An array of the locations in samples of the beat onsets in XFinal,\
              scoresFinal: An array of matching scores for each beat)
    """
    XFinal = np.array([[0, 0]])
    beatsFinal = []  #The final beat locations based on hop size
    scoresFinal = []
    for i in range(path.shape[0]):
        j, k = path[i, 0], path[i, 1]
        if j >= CSM.shape[0] or k >= CSM.shape[1]:
            break
        scoresFinal.append(CSM[j, k])
        t1 = beats1[j] * hopSize
        t2 = beats1[j + BeatsPerBlock] * hopSize
        s1 = beats2[k] * hopSize
        s2 = beats2[k + BeatsPerBlock] * hopSize
        x1 = XAudio1[t1:t2]
        x2 = XAudio2[s1:s2]
        #Figure out the time factor by which to stretch x2 so it aligns with x1
        fac = float(len(x1)) / len(x2)
        print("fac = ", fac)
        x2 = pyrb.time_stretch(x2, Fs, 1.0 / fac)
        print("len(x1) = %i, len(x2) = %i" % (len(x1), len(x2)))
        N = min(len(x1), len(x2))
        x1 = x1[0:N]
        x2 = x2[0:N]
        X = np.zeros((N, 2))
        X[:, 0] = x1
        X[:, 1] = x2
        if len(fileprefix) > 0:
            filename = "%s_%i.mp3" % (fileprefix, i)
            sio.wavfile.write("temp.wav", Fs, X)
            subprocess.call(["avconv", "-i", "temp.wav", filename])
        beat1 = beats1[j + 1] * hopSize - t1
        beatsFinal.append(XFinal.shape[0])
        XFinal = np.concatenate((XFinal, X[0:beat1, :]))
    return (XFinal, beatsFinal, scoresFinal)
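# A minimal sketch of the per-block stretch used in syncBlocks: stretching
# x2 by 1.0/fac, with fac = len(x1)/len(x2), brings it to roughly the length
# of x1. The sample rate and block lengths below are made up.
import numpy as np
import pyrubberband as pyrb

Fs = 22050
x1 = np.random.randn(Fs * 2)         # 2-second reference block
x2 = np.random.randn(Fs * 3 // 2)    # 1.5-second block to align
fac = float(len(x1)) / len(x2)
x2s = pyrb.time_stretch(x2, Fs, 1.0 / fac)
print(len(x1), len(x2s))             # lengths should now be close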
def rubberband(incr, path1, path2):
    y, sr = librosa.load(path1, sr=None)
    y_stretched = pyrubberband.time_stretch(y, sr, incr)
    sf.write(path2, y_stretched, sr, format='wav')
    label1 = Label(w, text="Done!")
    label1.pack()
def do_agumentation(self):
    no_class = os.listdir(self.input_path)
    for name in no_class:
        files = os.listdir(self.input_path + name + "/")
        for i, audio in enumerate(files):
            y, sr = sf.read(self.input_path + name + "/" + audio)
            y_stretch = pyrb.time_stretch(y, sr, 2.0)
            # Write one output file per input clip; writing to
            # self.output_path + name alone would overwrite the same
            # file on every iteration.
            wav.write(self.output_path + name + "/" + "augmented_" + audio, sr, y_stretch)
            print(name + "/" + "augmented_" + audio, "has been augmented and saved")
def stretch_audio(filepath):
    y, sr = librosa.load(filepath, sr=None)
    y_stretched = pyrubberband.time_stretch(y, sr, args.stretch_constant)
    # Save a suffixed copy, then overwrite the original file in place.
    sf.write(filepath + str(args.stretch_constant) + '.wav', y_stretched, sr, format='wav')
    sf.write(filepath, y_stretched, sr, format='wav')
def timeStretch(input_tempo):
    y_shift, sr = librosa.load(can_ps_output, sr=44100)
    y_tempo = mas.get_tempo(can_ps_output)
    print("can_tempo:{}".format(y_tempo))
    rate = float(input_tempo) / y_tempo
    print("stretch_rate:{}".format(rate))
    # librosa.effects.time_stretch(y_shift, rate)  # by librosa
    y_stretch_shift = pyrb.time_stretch(y_shift, sr, rate)  # seg11
    sf.write(can_output, y_stretch_shift, samplerate=44100)
def __call__(self, x, stretch=1):
    """Stretch the time of the given signal

    Args:
        x (numpy.ndarray): input signal (n_samples,)
        stretch (float, int): stretch rate (ratio; values > 1 speed up
            and shorten the signal)

    Returns:
        numpy.ndarray: output (n_samples,)
    """
    y = pyrb.time_stretch(x, self.sample_rate, stretch)
    return y
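# A minimal, self-contained usage sketch of the transform pattern above;
# the class name TimeStretcher and its constructor are assumptions for
# illustration.
import numpy as np
import pyrubberband as pyrb

class TimeStretcher:
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def __call__(self, x, stretch=1):
        return pyrb.time_stretch(x, self.sample_rate, stretch)

stretcher = TimeStretcher(sample_rate=16000)
x = np.random.randn(16000)          # 1 second of audio
y = stretcher(x, stretch=1.25)      # rate > 1 shortens: ~0.8 s
print(x.shape, y.shape)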
def __call__(self, wav=None, sr=None):
    assert len(wav.shape) == 1
    if random.random() < self.prob:
        alpha = 1.0 + self.limit * random.uniform(-1, 1)
        if self.use_pyrb:
            _wav = pyrb.time_stretch(wav, sr, alpha)
        else:
            _wav = librosa.effects.time_stretch(wav, alpha)
        # Only keep the stretched signal if it still fits the duration budget.
        if _wav.shape[0] < self.max_duration:
            wav = _wav
    return {'wav': wav, 'sr': sr}
def _stretched_audio_by_incre_bpm(song, beats, incre_bpm, sr):
    samples = np.array([]).reshape(2, 0)
    for i in range(len(beats) - 1):
        # Stretch all samples between beat i and the beginning of the next beat.
        sample = librosa.frames_to_samples(beats[i:i + 2])
        y_raw = song.raw_audio_duo[:, sample[0]:sample[1]]
        stretch_ratio = incre_bpm[i] / song.bpm
        # Transpose twice because pyrb takes (n, 2) while librosa takes (2, n).
        t_y_raw = y_raw.transpose()
        t_y_stretch = pyrb.time_stretch(t_y_raw, sr, stretch_ratio)
        samples = np.concatenate([samples, t_y_stretch.transpose()], axis=1)
    return samples
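# pyrubberband operates on (n_samples, n_channels) arrays, while librosa
# conventionally uses (n_channels, n_samples), hence the double transpose
# above. A minimal shape check (all values here are made up):
import numpy as np
import pyrubberband as pyrb

sr = 22050
stereo = np.random.randn(2, sr)              # librosa-style (2, n)
out = pyrb.time_stretch(stereo.T, sr, 1.2)   # pyrb wants (n, 2)
print(out.shape)                             # roughly (n / 1.2, 2)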
def change_tempo(audio, bpm, new_bpm):
    y = np.array(audio.get_array_of_samples())
    if audio.channels == 2:
        y = y.reshape((-1, 2))
    sample_rate = audio.frame_rate
    tempo_ratio = new_bpm / bpm
    y_fast = pyrb.time_stretch(y, sample_rate, tempo_ratio)
    channels = 2 if (y_fast.ndim == 2 and y_fast.shape[1] == 2) else 1
    y = np.int16(y_fast * 2 ** 15)
    new_seg = AudioSegment(y.tobytes(), frame_rate=sample_rate, sample_width=2, channels=channels)
    return new_seg
def do_agumentation(self):
    no_class = os.listdir(self.input_path)
    for name in no_class:
        files = os.listdir(self.input_path + name + "/")
        files = [f for f in files if f.endswith(".wav")]
        for i, audio in enumerate(files):
            print(audio)
            y, sr = sf.read(self.input_path + name + "/" + audio)
            time = random.uniform(0.6, 1.3)
            y_stretch = pyrb.time_stretch(y, sr, time)
            # Shift pitch at the file's own sample rate (the hard-coded
            # 22050 here would be wrong for files at other rates).
            y_augment = pyrb.pitch_shift(y_stretch, sr, 1)
            # print(y_augment)
            wav.write(self.output_path + name + "/" + "augmented_" + audio, sr, y_augment)
            print(name + "/" + "augmented_" + audio, "has been augmented and saved")
def change_audioseg_tempo(segment, scale):
    y = np.array(segment.get_array_of_samples())
    if segment.channels == 2:
        y = y.reshape((-1, 2))
    sr = segment.frame_rate
    y_fast = pyrb.time_stretch(y, sr, scale)
    channels = 2 if (y_fast.ndim == 2 and y_fast.shape[1] == 2) else 1
    y = np.int16(y_fast * 2**15)
    new_seg = AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    return new_seg
def change_speed_only(sound, tempo_ratio):
    y = np.array(sound.get_array_of_samples())
    if sound.channels == 2:
        y = y.reshape((-1, 2))
    sample_rate = sound.frame_rate
    y_fast = pyrubberband.time_stretch(y, sample_rate, tempo_ratio)
    channels = 2 if (y_fast.ndim == 2 and y_fast.shape[1] == 2) else 1
    y = np.int16(y_fast * 2**15)
    new_seg = AudioSegment(y.tobytes(), frame_rate=sample_rate, sample_width=2, channels=channels)
    return new_seg
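# A minimal usage sketch shared by the three pydub helpers above, assuming a
# 16-bit WAV file named input.wav (a hypothetical path); note they all
# convert the stretched float output back to int16 with a fixed
# sample_width of 2.
from pydub import AudioSegment

sound = AudioSegment.from_wav("input.wav")
faster = change_speed_only(sound, 1.5)      # 1.5x speed, pitch unchanged
faster.export("output_1.5x.wav", format="wav")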
def audio_stretch(audio_path, start_point, end_point, stretch, out_path):
    full_audio = AudioSegment.from_wav(audio_path)
    to_stretch = full_audio[start_point * 1000:end_point * 1000]
    y, sr = sf.read(audio_path)
    to_stretch_array = np.asarray(to_stretch.get_array_of_samples())
    stretched_audio = pyrb.time_stretch(to_stretch_array, sr, 1 / stretch)
    librosa.output.write_wav('temp.wav', stretched_audio, sr)
    part1, _ = librosa.core.load(audio_path, duration=start_point, sr=sr)
    part2, _ = librosa.core.load('temp.wav', sr=sr)
    part3, _ = librosa.core.load(audio_path, offset=end_point, sr=sr)
    final_audio = np.concatenate((part1, part2, part3))
    librosa.output.write_wav(out_path, final_audio, sr)
def test_stretch(sr, random_signal, num_samples, rate):
    '''Test the output shape of random signals for various stretching rates.
    '''
    # input signal of shape (sr * duration, channels)
    y = random_signal
    y_s = pyrubberband.time_stretch(y, sr, rate=rate)

    # test if output dimension matches input dimension
    assert y_s.ndim == y.ndim

    # check shape
    if y.ndim > 1:
        # check number of channels
        assert y.shape[1] == y_s.shape[1]
    else:
        # check num_samples (stretching factor)
        assert np.allclose(y_s.shape[0] * rate, y.shape[0])
def concatOnsets(self, sequence, corpusUnits, targetUnits, stretchUnits=False, windowUnits=False):
    """Concatenate audio units back to back, with optional time stretching to
    match the target. Can also optionally window the audio.

    :param sequence: list of indices into the corpusUnits
    :param corpusUnits: list of corpus unit audio signals
    :param targetUnits: list of target unit audio signals
    :param stretchUnits: stretch each corpus unit to match the target unit
    :param windowUnits: apply a window to the signal

    :return: an audio signal
    """
    import pyrubberband as pyrb

    audio = []
    for i, item in enumerate(sequence):
        corpusUnit = corpusUnits[item]

        # Use Rubber Band to stretch the audio to match the target
        if stretchUnits:
            factor = len(corpusUnit) / float(len(targetUnits[i]))
            corpusUnit = pyrb.time_stretch(corpusUnit, 44100, factor)

        # Envelope each unit with a Hamming window before concatenation
        if windowUnits:
            window = np.hamming(len(corpusUnit))
            corpusUnit = corpusUnit * window

        audio = np.append(audio, corpusUnit)

    return audio
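# A minimal sketch of the stretch-to-match step in concatOnsets, on synthetic
# units; the lengths and the 44.1 kHz rate are assumptions for illustration.
import numpy as np
import pyrubberband as pyrb

corpus_unit = np.random.randn(22050)    # ~0.5 s at 44.1 kHz
target_unit = np.random.randn(44100)    # ~1.0 s
factor = len(corpus_unit) / float(len(target_unit))    # 0.5
matched = pyrb.time_stretch(corpus_unit, 44100, factor)
print(len(matched), len(target_unit))   # lengths should be close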
def makeAnalogy(X, Fs, beatsA, filename_b, hopSize, winSize, ws, TempoBias, MFCCWeight=1.0, HPCPWeight=1.0):
    """
    Make a cover song analogy: given audio for (A, A') and B, make B'

    :param X: Audio waveform for A and A'; A is in the first column, A' in the second
    :param Fs: Sample rate of all audio files
    :param beatsA: Beat onsets (in samples)
    :param filename_b: Path to the audio file for song B
    :param hopSize: Feature hop size
    :param winSize: Window size for MFCCs and HPCPs
    :param ws: Window weights of all features
    :param TempoBias: Tempo bias for beat tracking of song B
    :param MFCCWeight: Weight of the MFCC features
    :param HPCPWeight: Weight of the HPCP features
    """
    #Step 1: Load in new example from artist 1 (B song)
    print("Loading new example...")
    XA = X[:, 0]
    XAp = X[:, 1]
    XB, Fs2 = librosa.load(filename_b)
    XB = librosa.core.to_mono(XB)

    #Step 2: Use the rubberband library to change the tempo of B so that
    #it's in line with the tempo of song A
    tempoB, beatsB = librosa.beat.beat_track(XB, Fs2, start_bpm=TempoBias, hop_length=hopSize)
    tempoA = 60.0 / (np.mean(beatsA[1::] - beatsA[0:-1]) / float(Fs))
    print("tempoA = %g, tempoB = %g" % (tempoA, tempoB))
    ratio = float(tempoA) / tempoB
    print("Shifting by ratio: %g" % ratio)
    XB = pyrb.time_stretch(XB, Fs2, ratio)
def augment(self, array, count):
    # Original signal.
    # The second argument determines the type of augmentation applied to the signal.
    self.sigToImage(array, 1, count)

    # Noise addition using a normal distribution with mean = 0 and std = 1.
    # Permissible noise factor value: x > 0.004.
    noiseAdding = array + 0.009 * np.random.normal(0, 1, len(array))
    self.sigToImage(noiseAdding, 2, count)

    # Permissible factor values: samplingRate / 100.
    timeShifting = np.roll(array, int(500 / 100))
    self.sigToImage(timeShifting, 3, count)

    # Permissible factor values: -5 <= x <= 5.
    pitchShifting = pyrb.pitch_shift(array, 500, -3)
    self.sigToImage(pitchShifting, 4, count)

    # Permissible factor values: 0 < x < 1.0.
    factor = 0.95  # Yields the best results without losing the ECG wave shape.
    timeStretching = pyrb.time_stretch(array, 500, factor)
    self.sigToImage(timeStretching, 5, count)
def testNMF2DMusic(K, T, F, NIters = 300, bins_per_octave = 24, shiftrange = 6, \
                   ZoomFac = 8, Trial = 0, Joint3Way = False, \
                   W1Fixed = False, HFixed = False, doKL = False):
    """
    :param Joint3Way: If true, do a joint embedding with A, Ap, and B\
        If false, then do a joint embedding with (A, Ap) and represent\
        B in the A dictionary
    """
    import librosa
    from scipy.io import wavfile
    import pyrubberband as pyrb

    #Synthesizing AAF's "Bad"
    """
    Fs, X = wavfile.read("music/SmoothCriminalAligned.wav")
    X = np.array(X, dtype=np.float32)
    A = X[:, 0]/(2.0**15)
    Ap = X[:, 1]/(2.0**15)
    #Take 20-second clips from each
    A = A[0:Fs*20]
    Ap = Ap[0:Fs*20]
    B, Fs = librosa.load("music/MJBad.mp3")
    B = B[Fs*3:Fs*23]
    #A and A' tempos are from the synchronization code
    tempoA = 0.508
    tempoAp = 0.472
    tempoB = 0.53
    songname = "mj"
    #A good separation I got before
    res = sio.loadmat("FinalExamples/MJAAF_Bad/Joint2DNMFFiltered_K3_Z4_T20_Bins24_F14_Trial2/NMF2DJoint.mat")
    W1 = res['W1']
    W2 = res['W2']
    H1 = res['H1']
    do2DFilteredAnalogy(A, Ap, B, Fs, K, T, F, NIters, bins_per_octave, shiftrange, \
                        ZoomFac, Trial, Joint3Way, W1Fixed, HFixed, doKL, songname=songname, W1=W1, W2=W2, H1=H1)
    """

    #Synthesizing AAF's "Wanna Be Starting Something"
    """
    Fs, X = wavfile.read("music/SmoothCriminalAligned.wav")
    X = np.array(X, dtype=np.float32)
    A = X[:, 0]/(2.0**15)
    Ap = X[:, 1]/(2.0**15)
    #Take 20-second clips from each
    A = A[0:Fs*20]
    Ap = Ap[0:Fs*20]
    B, Fs = librosa.load("music/MJStartinSomething.mp3")
    #tempos = getTempos(A, Ap, B, Fs)
    tempoA = 0.508
    tempoAp = 0.472
    tempoB = 0.49
    B = pyrb.time_stretch(B, Fs, tempoB/tempoA)
    B = B[0:Fs*20]
    songname = "wanna"
    res = sio.loadmat("FinalExamples/MJAAF_Bad/Joint2DNMFFiltered_K3_Z4_T20_Bins24_F14_Trial2/NMF2DJoint.mat")
    W1 = res['W1']
    W2 = res['W2']
    H1 = res['H1']
    res = do2DFilteredAnalogy(A, Ap, B, Fs, K, T, F, NIters, bins_per_octave, shiftrange, \
                              ZoomFac, Trial, Joint3Way, W1Fixed, HFixed, doKL, songname=songname, W1=W1, W2=W2, H1=H1)
    Y = res['Y']
    foldername = res['foldername']
    Y = pyrb.time_stretch(Y, Fs, tempoA/tempoB)
    wavfile.write("%s/BpFinalStretched.wav"%foldername, Fs, Y)
    """

    #Synthesizing Marilyn Manson "Who's That Girl"
    Fs, X = wavfile.read("music/SweetDreams/SweetDreamsAlignedClip.wav")
    X = np.array(X, dtype=np.float32)
    A = X[:, 0]/(2.0**15)
    Ap = X[:, 1]/(2.0**15)
    #Take 20-second clips from each
    A = A[0:Fs*20]
    Ap = Ap[0:Fs*20]
    B, Fs = librosa.load("music/SweetDreams/WhosThatGirlClip.wav")
    B = B[0:Fs*20]
    tempoA = 0.477
    tempoB = 0.65
    songname = "eurythmics"
    res = do2DFilteredAnalogy(A, Ap, B, Fs, K, T, F, NIters, bins_per_octave, shiftrange, \
                              ZoomFac, Trial, Joint3Way, W1Fixed, HFixed, doKL, songname=songname)
    Y = res['Y']
    foldername = res['foldername']
    Y = pyrb.time_stretch(Y, Fs, tempoA/tempoB)
    wavfile.write("%s/BpFinalStretched.wav"%foldername, Fs, Y)
def audio(mudabox, state):
    # Deform the audio and metadata
    mudabox._audio['y'] = pyrb.time_stretch(mudabox._audio['y'],
                                            mudabox._audio['sr'],
                                            state['rate'])
def testMIDIExample(T, F, NIters = 300, bins_per_octave = 24, shiftrange = 6, \
                    ZoomFac = 8, Trial = 0, HFixed = False, doKL = True):
    import librosa
    from scipy.io import wavfile
    import pyrubberband as pyrb
    from CQT import getNSGT
    initParallelAlgorithms()
    path = "music/MIDIExample/BeeGeesTracks/"
    NTracks = 6
    W1 = np.array([])
    H1 = np.array([])
    startidx = 27648  #Where the synchronized path starts
    for track in range(NTracks):
        matfilename = "%s/WH%i_F%i_T%i_Z%i_Trial%i.mat"%(path, track+1, F, T, ZoomFac, Trial)
        if not os.path.exists(matfilename):
            X, Fs = librosa.load("%s/%i.mp3"%(path, track+1))
            X = X[startidx:startidx+Fs*10]
            wavfile.write("Track%i.wav"%track, Fs, X)
            print("Doing CQT of track %i..."%track)
            C0 = getNSGT(X, Fs, bins_per_octave)
            #Zero-pad to the nearest even multiple of the zoom factor
            NRound = ZoomFac*int(np.ceil(C0.shape[1]/float(ZoomFac)))
            C = np.zeros((C0.shape[0], NRound), dtype = np.complex)
            C[:, 0:C0.shape[1]] = C0
            C = np.abs(C)
            C = scipy.ndimage.interpolation.zoom(C, (1, 1.0/ZoomFac))
            plotfn = lambda V, W, H, iter, errs: plotNMF2DConvSpectra(V, W, H, iter, errs, hopLength = 128)
            (Wi, Hi) = doNMF2DConvGPU(C, 1, T, F, L=100, doKL = doKL, plotfn = plotfn, plotInterval=400)
            sio.savemat(matfilename, {"W":Wi, "H":Hi})
        else:
            res = sio.loadmat(matfilename)
            Wi = res["W"]
            Hi = res["H"]
        if W1.size == 0:
            W1 = np.zeros((T, Wi.shape[1], NTracks))
            H1 = np.zeros((F, NTracks, Hi.shape[2]))
        Wi = np.reshape(Wi, [Wi.shape[0], Wi.shape[1]])
        Hi = np.reshape(Hi, [Hi.shape[0], Hi.shape[2]])
        W1[:, :, track] = Wi
        H1[:, track, :] = Hi
    K = NTracks

    Fs, X = wavfile.read("music/MIDIExample/stayinalivesyncedclip.wav")
    X = np.array(X, dtype=np.float32)
    A = X[:, 0]/(2.0**15)
    Ap = X[:, 1]/(2.0**15)
    #Take 10-second clips from each
    A = A[0:Fs*10]
    Ap = Ap[0:Fs*10]
    B, Fs = librosa.load("music/MIDIExample/TupacMIDIClip.mp3")
    tempoA = 0.578
    tempoB = 0.71
    B = pyrb.time_stretch(B, Fs, tempoB/tempoA)
    wavfile.write("BStretched.wav", Fs, B)
    B = B[0:Fs*10]
    songname = "madatchya"
    if not HFixed:
        H1 = np.array([])
    res = do2DFilteredAnalogy(A, Ap, B, Fs, K, T, F, NIters, bins_per_octave, shiftrange, \
                              ZoomFac, Trial, False, W1Fixed=True, HFixed=HFixed, doKL = doKL, W1 = W1, H1=H1, songname=songname)
    Y = res['Y']
    foldername = res['foldername']
    Y = pyrb.time_stretch(Y, Fs, tempoA/tempoB)
    wavfile.write("%s/BpFinalStretched.wav"%foldername, Fs, Y)
original_word_len = len(word) / sr  # in secs
original_samp = len(word)
rise_samp = int(sr * rise_sec)  # convert rise length to samples
end_dur = int(beat_length * sr)  # convert beat length to samples...should be 7350 uncorrected

uncorrected_stretch = original_samp / end_dur
corrected_onset = int(onset / uncorrected_stretch)

# compute stretch factor
stretch_factor = original_samp / (end_dur + (corrected_onset - rise_samp))

# Stretch
#word = librosa.effects.time_stretch(word, stretch_factor)
word = pyrb.time_stretch(word, sr, stretch_factor)

# compute stretched onset
#onset = int(onset / stretch_factor)
onset = librosa.onset.onset_detect(word, units='samples')
onset = onset[0]  # just return the first onset

# start the sound file a rise-length before the onset
word = word[int(onset - rise_samp):]

# correction procedure because the librosa stretch doesn't work properly:
# compute the difference between the desired duration and the actual one
difference = end_dur - len(word)

# zero-pad the difference
if difference > 0:
ysync_bad[:, 1] = x2
sio.wavfile.write("sync_bad.wav", sr, ysync_bad)

res = DTW(X1, X2)
D, path = res['D'], res['path']
indices = []
for i in range(D.shape[0]):
    indices.append([])
for p in path:
    indices[p[0]].append(p[1])
x2sync = []
for i, js in enumerate(indices):
    j1 = min(js)
    j2 = max(js)
    k = j2 - j1 + 1
    print(i, k)
    if k > 1:
        # k frames of x2 align to one frame of x1, so compress the span by a factor of k
        x = x2[hop * j1:hop * (j2 + 1)]
        x = pyrb.time_stretch(x, sr, k)
        x2sync += x.tolist()
    elif k == 1:
        x = x2[hop * j1:hop * (j2 + 1)]
        x2sync += x.tolist()
x2sync = np.array(x2sync)
y = np.zeros((x1.size, 2))
y[:, 0] = x1
y[0:x2sync.size, 1] = x2sync
sio.wavfile.write("sync.wav", sr, y)
i = 0
desired_tempo = 100
outfpath = "output/%s-%sbpm.flac" % (list(document.sents)[0].string[:10], desired_tempo)
for fpath in list(outfiles):
    # Load the file from disk, trim excess silence, slow it down a bit, and
    # concatenate it to the full file
    sentence, sr = librosa.core.load(fpath)
    trimmed, index = librosa.effects.trim(sentence)
    onset_env = librosa.onset.onset_strength(y=trimmed, sr=sr)
    tempo = librosa.beat.tempo(y=trimmed, onset_envelope=onset_env)
    rate = desired_tempo / int(tempo)
    print(fpath, tempo, rate)
    slowed = pyrb.time_stretch(trimmed, sr, rate)
    if i != 0:
        outf, sr = librosa.core.load(outfpath)
        z = np.append(outf, slowed)
        sf.write(outfpath, z, sr, format='flac', subtype='PCM_24')
    else:
        sf.write(outfpath, slowed, sr, format='flac', subtype='PCM_24')
    i += 1
print("\nSaved whole text as %s\n\
import wave
import sys

from pydub import AudioSegment
import soundfile as sf
import pyrubberband as pyrb

# sound = AudioSegment.from_mp3(sys.argv[1])
# sound.export("file.wav", format="wav")

y, sr = sf.read("0.wav")
y_stretch = pyrb.time_stretch(y, sr, 0.90)
# Note: the pitch-shifted version is computed but only the stretched one is written.
y_shift = pyrb.pitch_shift(y, sr, 0.90)
sf.write("analyzed_filepathX5.wav", y_stretch, sr, format='wav')
import scipy

def load_wav(fname):
    srate, audio = wav.read(fname)
    audio = audio.astype(np.float32) / 32767.0
    audio = (0.9 / np.max(audio)) * audio
    # convert to mono
    if (len(audio.shape) == 2):
        audio = (audio[:, 0] + audio[:, 1]) / 2
    return (audio, srate)

dreamer, srate = load_wav('dreamer.wav')
dreamer_live, srate = load_wav('dreamer_live.wav')
dreamer_slow = pyrb.time_stretch(dreamer, srate, 0.75)
goodbye_stranger, srate = load_wav('goodbye_stranger.wav')
naima, srate = load_wav('naima.wav')

# The constant-Q transform represents energy among different pitch classes across time.
# Beat-synchronous chroma vectors reduce the size of chroma vectors and actually make
# the representation tempo invariant.
def plot_chromagram(y):
    y = y[0:8000000]
    C = librosa.feature.chroma_cqt(y, sr=srate, bins_per_octave=12, norm=2)
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    # Display the chromagram: the energy in each chromatic pitch class as a function of time
def generate_labels_features_voca(self, all_list):
    pid = os.getpid()
    mp3_config, feature_config, mp3_str, feature_str = self.config_to_folder()

    i = 0  # number of songs
    j = 0  # number of impossible songs
    k = 0  # number of tried songs
    total = 0  # number of generated instances
    stretch_factors = [1.0]
    shift_factors = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6]

    loop_broken = False
    for song_name, lab_path, mp3_path, save_path in all_list:
        save_path = save_path + '_voca'

        # different song initialization
        if loop_broken:
            loop_broken = False

        i += 1
        print(pid, "generating features from ...", os.path.join(mp3_path))
        if i % 10 == 0:
            print(i, ' th song')

        original_wav, sr = librosa.load(os.path.join(mp3_path), sr=mp3_config['song_hz'])

        # save_path, mp3_string, feature_string, song_name, aug.pt
        result_path = os.path.join(save_path, mp3_str, feature_str, song_name.strip())
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        # calculate result
        for stretch_factor in stretch_factors:
            if loop_broken:
                loop_broken = False
                break

            for shift_factor in shift_factors:
                # for filename
                idx = 0

                try:
                    chord_info = self.Chord_class.get_converted_chord_voca(os.path.join(lab_path))
                except Exception as e:
                    print(e)
                    print(pid, " chord lab file error : %s" % song_name)
                    loop_broken = True
                    j += 1
                    break

                k += 1
                # stretch original sound and chord info
                x = pyrb.time_stretch(original_wav, sr, stretch_factor)
                x = pyrb.pitch_shift(x, sr, shift_factor)
                audio_length = x.shape[0]
                chord_info['start'] = chord_info['start'] * 1 / stretch_factor
                chord_info['end'] = chord_info['end'] * 1 / stretch_factor

                last_sec = chord_info.iloc[-1]['end']
                last_sec_hz = int(last_sec * mp3_config['song_hz'])

                if audio_length + mp3_config['skip_interval'] < last_sec_hz:
                    print('loaded song is too short :', song_name)
                    loop_broken = True
                    j += 1
                    break
                elif audio_length > last_sec_hz:
                    x = x[:last_sec_hz]

                origin_length = last_sec_hz
                origin_length_in_sec = origin_length / mp3_config['song_hz']

                current_start_second = 0

                # get chord list between current_start_second and current + song_length
                while current_start_second + mp3_config['inst_len'] < origin_length_in_sec:
                    inst_start_sec = current_start_second
                    curSec = current_start_second

                    chord_list = []
                    # extract chord per 1 / self.time_interval
                    while curSec < inst_start_sec + mp3_config['inst_len']:
                        try:
                            available_chords = chord_info.loc[
                                (chord_info['start'] <= curSec) &
                                (chord_info['end'] > curSec + self.time_interval)].copy()
                            if len(available_chords) == 0:
                                available_chords = chord_info.loc[
                                    ((chord_info['start'] >= curSec) &
                                     (chord_info['start'] <= curSec + self.time_interval)) |
                                    ((chord_info['end'] >= curSec) &
                                     (chord_info['end'] <= curSec + self.time_interval))].copy()

                            if len(available_chords) == 1:
                                chord = available_chords['chord_id'].iloc[0]
                            elif len(available_chords) > 1:
                                max_starts = available_chords.apply(
                                    lambda row: max(row['start'], curSec), axis=1)
                                available_chords['max_start'] = max_starts
                                min_ends = available_chords.apply(
                                    lambda row: min(row.end, curSec + self.time_interval), axis=1)
                                available_chords['min_end'] = min_ends
                                chords_lengths = available_chords['min_end'] - available_chords['max_start']
                                available_chords['chord_length'] = chords_lengths
                                chord = available_chords.loc[
                                    available_chords['chord_length'].idxmax()]['chord_id']
                            else:
                                chord = 169
                        except Exception as e:
                            chord = 169
                            print(e)
                            print(pid, "no chord")
                            raise RuntimeError()
                        finally:
                            # convert chord by shift factor
                            if chord != 169 and chord != 168:
                                chord += shift_factor * 14
                                chord = chord % 168

                            chord_list.append(chord)
                            curSec += self.time_interval

                    if len(chord_list) == self.no_of_chord_datapoints_per_sequence:
                        try:
                            sequence_start_time = current_start_second
                            sequence_end_time = current_start_second + mp3_config['inst_len']

                            start_index = int(sequence_start_time * mp3_config['song_hz'])
                            end_index = int(sequence_end_time * mp3_config['song_hz'])

                            song_seq = x[start_index:end_index]

                            etc = '%.1f_%.1f' % (current_start_second,
                                                 current_start_second + mp3_config['inst_len'])
                            aug = '%.2f_%i' % (stretch_factor, shift_factor)

                            if self.feature_name == FeatureTypes.cqt:
                                feature = librosa.cqt(song_seq,
                                                      sr=sr,
                                                      n_bins=feature_config['n_bins'],
                                                      bins_per_octave=feature_config['bins_per_octave'],
                                                      hop_length=feature_config['hop_length'])
                            else:
                                raise NotImplementedError

                            if feature.shape[1] > self.no_of_chord_datapoints_per_sequence:
                                feature = feature[:, :self.no_of_chord_datapoints_per_sequence]

                            if feature.shape[1] != self.no_of_chord_datapoints_per_sequence:
                                print('loaded features length is too short :', song_name)
                                loop_broken = True
                                j += 1
                                break

                            result = {
                                'feature': feature,
                                'chord': chord_list,
                                'etc': etc
                            }

                            # save_path, mp3_string, feature_string, song_name, aug.pt
                            filename = aug + "_" + str(idx) + ".pt"
                            torch.save(result, os.path.join(result_path, filename))

                            idx += 1
                            total += 1
                        except Exception as e:
                            print(e)
                            print(pid, "feature error")
                            raise RuntimeError()
                    else:
                        print("invalid number of chord datapoints in sequence :", len(chord_list))

                    current_start_second += mp3_config['skip_interval']

    print(pid, "total instances: %d" % total)
def time_stretching(sig, sr, degree):
    # Thin wrapper around pyrubberband; degree > 1.0 speeds the signal up.
    return pyrb.time_stretch(sig, sr, degree)
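# Example call for the wrapper above (the signal and rate here are made up):
import numpy as np
import pyrubberband as pyrb

sr = 22050
sig = np.random.randn(sr * 3)           # 3 seconds of noise
slower = time_stretching(sig, sr, 0.5)  # rate < 1.0 slows down: ~6 seconds
print(len(sig), len(slower))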
def write_audio_file(path, name, voice, audio, sampling_rate):
    file_name = path + \
        time.strftime("%Y%m%d-%H%M%S_") + name + str(randint(0, 100)) + ".wav"
    if voice == "satan:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)
        fixed_framerate = 11000
        sound = AudioSegment.from_file(temp_file_name)
        sound = sound.set_frame_rate(fixed_framerate)
        # Writing the original samples at 11 kHz slows and deepens the voice.
        write(file_name, fixed_framerate, audio)
        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 1.6)
        # The pitch-shifted version is computed but only the stretched one is kept.
        y_shift = pyrb.pitch_shift(y, sr, 1.6)
        sf.write(file_name, y_stretch, sr, format='wav')
        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    elif voice == "vader:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)
        AudioEffect.robotic(temp_file_name, file_name)
        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 0.9)
        # As above, y_shift is unused.
        y_shift = pyrb.pitch_shift(y, sr, 0.9)
        sf.write(file_name, y_stretch, sr, format='wav')
        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    else:
        write(file_name, sampling_rate, audio)
    return file_name