def test_missing_cli(cli, ctx):
    '''Simulate not having rubberband-cli installed
    and check for the appropriate exception.
    '''
    with ctx:
        pyrubberband.pyrb.__RUBBERBAND_UTIL = cli
        pyrubberband.pitch_shift(np.random.randn(22050), 22050, 1)
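# A minimal sketch of fixtures this test could use (the fixture bodies, the
# binary name, and the exact exception type are assumptions, not shown by
# the source -- pyrubberband raises RuntimeError when its CLI cannot run):
import pytest

@pytest.fixture
def cli():
    return 'no-such-binary'  # hypothetical missing executable name

@pytest.fixture
def ctx():
    return pytest.raises(RuntimeError)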
def pitch_shift_test(self):
    '''Test the audio degradation introduced by pitch shifting.
    This is now ahead of the other stuff.

    TODO: figure out why the duration is exploding -> the byte count
    increases when we filter.
    '''
    print("In test")
    print("Ownership Duration")
    #print(self.ownership_AudioSegment.duration_seconds)
    channels = self.ownership_AudioSegment.channels
    sample_width = self.ownership_AudioSegment.sample_width
    semitones = 5  # semitones(200) + min human speech(100) = inaudible audio

    # Band-limit the original ownership audio. Note: pydub AudioSegments are
    # immutable, so set_frame_rate must be assigned back, not called for a
    # side effect.
    seg = self.ownership_AudioSegment.set_frame_rate(self.original_sample_rate)
    cleaned_ownership_sound = seg.low_pass_filter(200)  # filter out noise above this
    cleaned_ownership_sound2 = cleaned_ownership_sound.high_pass_filter(50)  # and below this (human speech is typically 100-150 Hz)
    cleaned_ownership_sound2 = cleaned_ownership_sound2.set_frame_rate(self.original_sample_rate)
    cleaned_ownership_sound2.export("./pst_original.mp3", format="mp3")  # just so we can inspect it
    ownership_wav_data = np.array(cleaned_ownership_sound2.get_array_of_samples())

    # Pitch shift ownership audio up to the inaudible range...
    inaudible_samples = pyrubberband.pitch_shift(ownership_wav_data, self.original_sample_rate, semitones)
    # ...then back down to audible, and clean again.
    audible_samples = pyrubberband.pitch_shift(inaudible_samples, self.original_sample_rate, -semitones)
    print(audible_samples.shape)
    print("checking eq")
    print(np.allclose(audible_samples, ownership_wav_data))  # samples are way off -- why?
    mse = (np.square(audible_samples - ownership_wav_data)).mean(axis=0)
    print(mse)  # huge MSE off a pitch shift of just 5 semitones

    sf.write('./pst_temp.wav', audible_samples, self.original_sample_rate)
    sound = AudioSegment.from_file('./pst_temp.wav', format="wav",
                                   sample_width=sample_width,
                                   frame_rate=self.original_sample_rate,
                                   channels=channels)
    other_samples = np.array(sound.get_array_of_samples())
    print(other_samples.shape)
    print("Transformed duration: ")
    print(sound.duration_seconds)

    cleaned_sound = sound.low_pass_filter(300)
    cleaned_sound2 = cleaned_sound.high_pass_filter(2)
    cleaned_sound2 = cleaned_sound2.set_frame_rate(self.original_sample_rate)
    cleaned_sound2 = cleaned_sound2 + 10  # +10 dB gain
    print("Cleaned Transformed Duration")
    print(cleaned_sound2.duration_seconds)
    cleaned_sound2.export('./pst_transformed.mp3', format="mp3")
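# Why the MSE above is huge: Rubber Band is a phase-vocoder-style processor,
# so shifting up and back down preserves perceived content but not
# sample-level phase. A sketch comparing magnitude spectra instead
# (standalone; the variable names here are illustrative):
import numpy as np
import pyrubberband

sr = 22050
x = np.random.randn(sr)
roundtrip = pyrubberband.pitch_shift(
    pyrubberband.pitch_shift(x, sr, 5), sr, -5)
n = min(len(x), len(roundtrip))
sx = np.abs(np.fft.rfft(x[:n]))
sy = np.abs(np.fft.rfft(roundtrip[:n]))
print(np.corrcoef(sx, sy)[0, 1])  # spectral similarity, not sample equality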
def postprocess_source(file: Path, config: DatasetConfig):
    """
    Applies postprocessing to the main source signal (positive or negative word)

    :param file: path to main signal file
    :param config: dataset config
    :return: completed source signal
    """
    main_signal, _ = librosa.load(str(file), sr=config.sample_rate)
    undercuts = [np.random.uniform(*config.word_undercut_range) for _ in range(2)]
    left, right = [config.sec_to_samples(x) for x in undercuts]
    main_signal = main_signal[left:-right]

    if np.random.random() < config.rubberband_ratio:
        pitch = np.random.uniform(*config.rubberband_pitch_range)
        main_signal = pyrubberband.pitch_shift(main_signal, config.sample_rate, pitch)

    bg_source = get_weighted_item(config.background_sources)
    bg_file = get_random_file(AUDIOSET_PATH / bg_source)
    bg_signal, _ = librosa.load(str(bg_file), sr=config.sample_rate)
    bg_signal = get_random_chunk(bg_signal, config.sample_length, config.sample_rate)
    bg_gain = np.random.uniform(*config.background_gain_range)
    bg_signal = bg_gain * librosa.util.normalize(bg_signal) * np.max(main_signal)

    right_margin = np.random.uniform(*config.word_right_margin_range)
    main_signal_end = config.sample_length - right_margin
    return merge_signals(
        (bg_signal, 0, None),
        (main_signal, None, main_signal_end),
        sample_rate=config.sample_rate,
        length=config.sample_length,
    )
def getPitchShiftedSpecs(X, Fs, W, H, shiftrange=6, GapWins=10):
    """
    Concatenate a bunch of pitch shifted versions of the spectrograms
    of a sound, using the rubberband library
    :param X: A mono audio array
    :param Fs: Sample rate
    :param W: Window size
    :param H: Hop size
    :param shiftrange: The number of halfsteps below and above which \
        to shift the sound
    :returns SRet: The concatenated spectrogram
    """
    SRet = np.array([])
    for shift in range(-shiftrange, shiftrange + 1):
        print("Computing STFT pitch shift %i" % shift)
        if shift == 0:
            Y = np.array(X)
        else:
            Y = pyrb.pitch_shift(X, Fs, shift)
        S = STFT(Y, W, H)
        # np.complex was removed in recent NumPy; use the builtin complex
        Gap = np.zeros((S.shape[0], GapWins), dtype=complex)
        if SRet.size == 0:
            SRet = S
        else:
            SRet = np.concatenate((SRet, Gap, S), 1)
    return SRet
def convert_from_human_inaudible(self):
    #TODO Find out where the degradation in the negative shift is coming from ->
    #     alternatively, could apply a fade to require a smaller pitch shift
    #TODO Find out a way to do this without access to the original audio file
    '''
    Want to either be able to sample above a certain frequency or otherwise
    get the difference between the two files.
    Otherwise we can overlay the inverse of the original.
    '''
    start_time = 0
    duration = self.ownership_AudioSegment.duration_seconds
    semitones = -self.pitch_shift

    # pydub AudioSegments are immutable; set_frame_rate returns a new segment.
    sound1 = AudioSegment.from_file(self.combined_path, start_second=start_time,
                                    duration=duration, format="wav")
    sound1 = sound1.set_frame_rate(self.higher_sample_rate)
    original_sound = AudioSegment.from_file(self.original_file, start_second=start_time,
                                            duration=duration, format="wav")
    original_sound = original_sound.set_frame_rate(self.higher_sample_rate)

    sound2 = original_sound.invert_phase()  # this should cancel out the original audio in our clip
    combined = sound1.overlay(sound2)       # this should be just our pitch-shifted ownership audio
    combined = combined.set_frame_rate(self.higher_sample_rate)
    wav_data = np.array(combined.get_array_of_samples())
    try:
        # This should reverse the original pitch shift.
        audible_ownership = pyrubberband.pitch_shift(wav_data, self.higher_sample_rate, semitones)
        temp2 = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        sf.write(temp2.name, audible_ownership, self.higher_sample_rate)
        wav_to_mp3(temp2.name, './recovered_ownership.mp3', force=True)  # converted to mp3 for convenience
        temp2.close()
    except Exception as e:
        print(e)
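# The phase-cancellation step above, in plain numpy (a sketch; assumes both
# arrays are sample-aligned, at the same rate, and identically scaled):
import numpy as np

def extract_residual(combined, original):
    # Subtracting the original cancels it out, leaving only the embedded
    # (pitch-shifted) ownership signal.
    n = min(len(combined), len(original))
    return combined[:n] - original[:n]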
def getPitchShiftedRubberbandCQTs(X, Fs, CShape, bins_per_octave, shiftrange=6, GapWins=10):
    """
    Concatenate a bunch of pitch shifted versions of the NSGT CQT of X
    to each other. Use the rubberband library to get the best results
    (even though pitch shifting could be done with CQT + Griffin-Lim,
    as with Nakamura)
    :param X: A 1D array of audio samples
    :param Fs: Sample rate
    :param bins_per_octave: Bins per octave in the NSGT
    :param shiftrange: The number of halfsteps below and above which \
        to shift the sound
    :param GapWins: The length of the gap to include between \
        pitch shifted CQTs
    :returns CRet: The concatenated CQT spectrogram with all pitch shifts
    """
    import pyrubberband as pyrb
    # np.complex was removed in recent NumPy; use the builtin complex
    CRet = np.array([], dtype=complex)
    for shift in range(-shiftrange, shiftrange + 1):
        Y = pyrb.pitch_shift(X, Fs, shift)
        thisC = np.zeros(CShape, dtype=complex)
        thisCi = getNSGT(Y, Fs, bins_per_octave)
        thisC[:, 0:thisCi.shape[1]] = thisCi
        if CRet.size == 0:
            CRet = thisC
        else:
            Gap = np.zeros((thisC.shape[0], GapWins), dtype=complex)
            CRet = np.concatenate((CRet, Gap, thisC), 1)
    return CRet
def pitchShift(loop, pitch):
    y, sr = librosa.load(can_path + loop, sr=44100)
    sf.write(loop, y, samplerate=44100)  # original candidate
    # pitch shifting (output may differ slightly after shifting)
    y_shift = pyrb.pitch_shift(y, sr, n_steps=-pitch)
    sf.write(can_ps_output, y_shift, samplerate=44100)
    # y_shift = librosa.effects.pitch_shift(y, sr, n_steps=pitch)  # via librosa
def rb_pitch(data, sample_rate):
    """
    Pitch Tuning.
    """
    bins_per_octave = 12  # unused by pyrubberband; kept from the librosa variant
    pitch_pm = 2
    pitch_change = pitch_pm * 2 * (np.random.uniform())
    data = pitch_shift(data.astype('float64'), sample_rate, n_steps=pitch_change)
    # librosa alternative: librosa.effects.pitch_shift(..., bins_per_octave=bins_per_octave)
    return data
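# As written, pitch_change falls in [0, 2 * pitch_pm] semitones. If a
# two-sided "+/- pitch_pm" range was intended (an assumption about intent),
# the symmetric draw would be:
# pitch_change = pitch_pm * (2 * np.random.uniform() - 1)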
def __call__(self, wav=None, sr=22050):
    assert len(wav.shape) == 1
    if random.random() < self.prob:
        alpha = self.limit * random.uniform(-1, 1)
        if self.use_pyrb:
            wav = pyrb.pitch_shift(wav, sr, alpha)
        else:
            wav = librosa.effects.pitch_shift(wav, sr, n_steps=alpha)
    return {'wav': wav, 'sr': sr}
def __call__(self, x, shift=1):
    """Shift the pitch of the given signal

    Args:
        x (numpy.ndarray): input signal (n_samples,)
        shift (float, int): degree of shifting (unit: semitones)

    Returns:
        numpy.ndarray: output (n_samples,)
    """
    y = pyrb.pitch_shift(x, self.sample_rate, shift)
    return y
def convert_to_human_inaudible(self, output_path):
    #TODO figure out the exact semitones to inaudible
    #TODO figure out how to speed this up (chunk processing)
    # pydub AudioSegments are immutable; assign set_frame_rate's result.
    seg = self.ownership_AudioSegment.set_frame_rate(self.higher_sample_rate)
    wav_data = np.array(seg.get_array_of_samples())
    semitones = self.pitch_shift
    try:
        inaudible_samples = pyrubberband.pitch_shift(wav_data, self.higher_sample_rate, semitones)
        # Note: the samples were shifted at higher_sample_rate but are written
        # at original_sample_rate, which changes playback speed and pitch.
        sf.write(output_path, inaudible_samples, self.original_sample_rate)
    except Exception as e:
        print(e)
def do_agumentation(self):
    no_class = os.listdir(self.input_path)
    for name in no_class:
        files = os.listdir(self.input_path + name + "/")
        files = [f for f in files if f.endswith(".wav")]
        for i, audio in enumerate(files):
            print(audio)
            y, sr = sf.read(self.input_path + name + "/" + audio)
            time = random.uniform(0.6, 1.3)
            y_stretch = pyrb.time_stretch(y, sr, time)
            # Shift up one semitone at the file's own sample rate.
            y_augment = pyrb.pitch_shift(y_stretch, sr, 1)
            # print(y_augment)
            wav.write(self.output_path + name + "/" + "agumented_" + audio, sr, y_augment)
            print(name + "/" + "agumented_" + audio, "has been augmented and saved")
def pitchshift(folder, filename, shifts=24):
    """
    Pitch shift the audio file given as input and save it in the folder
    given as input

    Args:
        folder (str): path to folder where to save the pitch shifted audio
        filename (str): path to audio file to pitch-shift
        shifts (int, optional): shift to apply. Defaults to 24.
    """
    logger.info("loading audio")
    audio, orig_sr = librosa.load(filename)
    audio = librosa.resample(audio, orig_sr, target_sr)
    logger.info("shifting pitch")
    root = 40  # e2 is 40
    folder = pathlib.Path(folder).absolute()
    for n_steps in range(0, shifts + 1):
        # audio_shifted = librosa.effects.pitch_shift(audio, target_sr, n_steps, bins_per_octave=12)
        audio_shifted = pyrb.pitch_shift(audio, target_sr, n_steps)
        new_filename = f"{root+n_steps}.wav"
        new_filepath = folder / new_filename
        audio_shifted = audio_shifted.astype("float32")
        write(new_filepath, target_sr, audio_shifted)
        logger.debug(f"Creating: {new_filename}")
        logger.debug("==============================")
        # write("{}{}.wav".format(folder, i+root), y_shift, sr)
        # write(f'{folder}/{root+i}.wav', sr, y_shift)

    for n_steps in range(0 - shifts // 2, (shifts // 2) + 1):
        # amplitude normalization
        new_filename = f"{root+n_steps}.wav"
        new_filepath = folder / new_filename
        sound = AudioSegment.from_file(new_filepath, "wav")
        normalized_sound = match_target_amplitude(sound, -30.0)
        # another way:
        # normalized_sound = effects.normalize(sound)
        normalized_sound.export(new_filepath, format="wav")
        logger.debug(f"Normalizing: {new_filename}")
        logger.debug("==============================")

    logger.info(f"Audio files saved in folder: {folder}")
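# A plausible match_target_amplitude helper, following the standard pydub
# recipe (an assumption -- the source does not show its definition):
def match_target_amplitude(sound, target_dBFS):
    # Apply whatever gain moves the segment's average loudness to the target.
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)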
def test_pitch(sr, num_samples, freq, n_step):
    y = synth(sr, num_samples, freq)
    y_s = pyrubberband.pitch_shift(y, sr, n_step)

    # Make sure we have the same duration
    assert np.allclose(len(y), len(y_s))

    # Compare to a directly synthesized target track.
    # We compare normalized power spectra to avoid phase issues.
    t_freq = freq * 2.0**(n_step / 12.0)
    y_f = synth(sr, num_samples, t_freq)

    s_s = np.abs(np.fft.rfft(y_s))
    s_f = np.abs(np.fft.rfft(y_f))

    assert np.allclose(s_s / s_s[0], s_f / s_f[0], atol=1e-2)
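# A plausible synth() helper consistent with the test above (an assumption;
# the source does not include its definition):
import numpy as np

def synth(sr, num_samples, freq):
    # Pure sine tone at `freq` Hz, `num_samples` samples long.
    return np.sin(2 * np.pi * freq * np.arange(num_samples) / float(sr))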
def _set_pitch_shifted_path(audio_object):
    pitch_semitones = audio_object['pitch_semitones']
    file = audio_object['file']
    pitch_shifted_path = audio_object['absolute_path']

    # pitch shift audio as needed
    if pitch_semitones:
        shifted_file = f'PITCH SHIFTED {pitch_semitones}: {Path(file).stem}.wav'
        pitch_shifted_path = join(PLAYLIST_PATH, shifted_file)

        # create a new pitch shifted file if it does not exist
        if not Path(pitch_shifted_path).is_file():
            y, sr = librosa.load(audio_object['absolute_path'])
            print(f'shifting file by {pitch_semitones}: "{file}"')
            y_shift = pyrubberband.pitch_shift(y, sr, pitch_semitones)
            soundfile.write(pitch_shifted_path, y_shift, sr)
        else:
            print(f'CACHED SHIFT: "{shifted_file}"')

    audio_object['pitch_shifted_path'] = pitch_shifted_path
def augment(self, array, count):
    # Original signal.
    # The second argument selects the type of augmentation applied to the signal.
    self.sigToImage(array, 1, count)

    # Noise addition using a normal distribution with mean = 0 and std = 1.
    # Permissible noise factor value: x > 0.004
    noiseAdding = array + 0.009 * np.random.normal(0, 1, len(array))
    self.sigToImage(noiseAdding, 2, count)

    # Permissible factor values = samplingRate / 100
    timeShifting = np.roll(array, int(500 / 100))
    self.sigToImage(timeShifting, 3, count)

    # Permissible factor values = -5 <= x <= 5
    pitchShifting = pyrb.pitch_shift(array, 500, -3)
    self.sigToImage(pitchShifting, 4, count)

    # Permissible factor values = 0 < x < 1.0
    factor = 0.95  # yields the best results without losing the ECG wave shape
    timeStretching = pyrb.time_stretch(array, 500, factor)
    self.sigToImage(timeStretching, 5, count)
def audio(mudabox, state): mudabox._audio['y'] = pyrb.pitch_shift(mudabox._audio['y'], mudabox._audio['sr'], state['n_semitones'])
def testPitchShift(X, Fs, W, H, shift, filename):
    S = np.abs(STFT(X, W, H))
    S = pitchShiftSTFT(S, Fs, shift)
    X2 = griffinLimInverse(S, W, H, 20)
    wavfile.write(filename, Fs, X2)

if __name__ == '__main__':
    import librosa
    import pyrubberband as pyrb
    X, Fs = librosa.load("music/Beatles_LetItBe.mp3", sr=44100)
    print("Fs = ", Fs)
    shift = 2
    noctaves = 7
    y = pyrb.pitch_shift(X, Fs, shift)
    wavfile.write("rubberbandshift%i.wav" % shift, Fs, y)
    testPitchShift(X, Fs, 2048, 128, shift, "gfshift%i_stft.wav" % shift)

    from NMF import shiftMatLRUD
    winSize = 4096
    hopSize = 256
    S = STFT(X, winSize, hopSize)
    S = np.abs(S)
    M = warpSTFTMel(S, Fs, winSize)
    M = shiftMatLRUD(M, di=shift * 2)
    S = unwarpSTFTMel(M, Fs, winSize)
    y = griffinLimInverse(S, winSize, hopSize)
    y = y / np.max(np.abs(y))
    wavfile.write("melshift%i.wav" % shift, Fs, y)
mode="same") x_res[safe_index] += inv print("filter cost {}".format(time.time() - start_time)) return x_res, x_glottal_res, recons_psds, recons_vt_psds x_res, x_glottal_res, recons_psds, recons_vt_psds = inverse_lpc_fftconvolve( x, dat) wavwrite('x_res.wav', fs, (x_res * 2**15).astype(np.int16)) wavwrite('x_glottal_res.wav', fs, (x_glottal_res * 2**15).astype(np.int16)) shift = -12 import pyrubberband as pyrb x_res = pyrb.pitch_shift(x_res, fs, shift) y = synthesisRequiem.get_waveform(x_res, np.transpose(recons_psds, [1, 0]), dat['temporal_positions'], dat['f0'], dat['fs']) x_glottal_res = pyrb.pitch_shift(x_glottal_res, fs, shift) y_from_glottal = synthesisRequiem.get_waveform( x_glottal_res, np.transpose(recons_vt_psds, [1, 0]), dat['temporal_positions'], dat['f0'], dat['fs']) wavwrite('x_recons.wav', fs, (y * 2**15).astype(np.int16)) wavwrite('x_recons_glottal.wav', fs, (y_from_glottal * 2**15).astype(np.int16)) from scipy import interpolate import copy
import wave
import sys
from pydub import AudioSegment
import soundfile as sf
import pyrubberband as pyrb

# sound = AudioSegment.from_mp3(sys.argv[1])
# sound.export("file.wav", format="wav")

y, sr = sf.read("0.wav")
y_stretch = pyrb.time_stretch(y, sr, 0.90)
y_shift = pyrb.pitch_shift(y, sr, 0.90)
sf.write("analyzed_filepathX5.wav", y_stretch, sr, format='wav')
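# Only y_stretch is written above; y_shift is computed but never saved.
# If the pitch-shifted render is wanted too, a sketch (the filename is
# illustrative):
sf.write("analyzed_filepath_shifted.wav", y_shift, sr, format='wav')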
def pitch_shifting(sig, sr, degree):
    return pyrb.pitch_shift(sig, sr, degree)
def pitchshift(self, n):
    note = n - self.startNote
    sound = np.int16(prb.pitch_shift(self.sample, self.rate, note) * (2**15))
    return sound
y = y[:3 * 44100]
wn = np.clip((np.random.rand(len(y)) * 2 - 1) * 0.01, -1, 1)
y_wn = np.clip(y + wn, -1, 1)
gain_float = gain = 10**(6 / 20)
y_gain = np.clip(y * gain_float, -1, 1)
B, A = signal.butter(5, 5000 / (44100 / 2), btype='lowpass')
y_lp = signal.lfilter(B, A, y, axis=0)
B, A = signal.butter(5, 10000 / (44100 / 2), btype='highpass')
y_hp = signal.lfilter(B, A, y, axis=0)
y_stretch = pyrb.time_stretch(y, 44100, 0.7)[:3 * 44100]
y_pitch = pyrb.pitch_shift(y, 44100, -5)

fig, (ax1, ax2, ax3) = plt.subplots(ncols=2, nrows=3)
fig.tight_layout(pad=3)
Pxx, freqs, bins, im = ax1[0].specgram(y, NFFT=1024, Fs=44100, noverlap=900,
                                       scale_by_freq=True, detrend="mean")
ax1[0].set_title('Original')
ax1[0].set(ylabel='Frequency (Hz)')
ax1[0].xaxis.set_ticks([])
ax1[0].yaxis.set_ticks([0, 10_000, 20_000])
ax1[0].yaxis.set_ticklabels(["0", "10k", "20k"])
Pxx, freqs, bins, im = ax1[1].specgram(y_wn,
def __init__(
    self,
    files,
    limit=None,
    augment=False,
    duplicate=1,
    seed="42",
    device=torch.device("cpu"),
):
    self.device = device
    random = Random(seed)  # init random generator
    pos_prep = PositionalEncodingLabeler(POS_DIM, scale=POS_SCALE)
    self.counts = []
    base = self.base
    if limit is not None:
        files = files[:limit]

    inp, out_vec, out_int, out_map, out_dur, out_trans, out_trans_ints = (
        [], [], [], [], [], [], [],
    )
    position, border, weight, inp_mfcc = [], [], [], []
    duplicate_set = set()
    self.files = []

    def stack(arr, tensor):
        return [tensor(a).to(self.device) for a in arr]

    for i, (label_file, audio_file) in enumerate(files * duplicate):
        assert self.get_name(label_file) == self.get_name(audio_file)
        a, b, c = self.get_name(label_file)
        f"{a}_{b}_{c}_{i}"  # note: this f-string result is unused
        label_file = os.path.join(base, "data", label_file)
        audio_file = os.path.join(base, "data", audio_file)

        def loader():
            return AudioCaching.load(audio_file)

        audio = self.get_set(audio_file, loader)
        audio_scaling, rate = 32768.0 / 512, 16000
        audio_base_len = len(audio)

        meter = pyln.Meter(rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(audio)
        audio = pyln.normalize.loudness(audio, loudness, -40.0)

        stretch = 1
        pure_key = (audio_file, "pure_key")
        if pure_key not in duplicate_set:
            duplicate_set.add(pure_key)
        elif augment:
            # pitch = random.choice([-6, -4, -1, 1, 4, 6])
            # stretch = random.choice([0.85, 0.9, 0.95, 1.05, 1.1, 1.15])
            # pitch = random.choice([-4, -1, 1, 4])
            pitch = random.choice([-1, 0, 1])
            stretch = random.choice([0.9, 0.95, 1.05, 1.1])
            key_stretch = "time_stretch", stretch
            key_pitch = "pitch_shift", pitch, stretch
            duplication_key = (audio_file, key_pitch)
            if duplication_key in duplicate_set:
                continue
            duplicate_set.add(duplication_key)

            def audio_pitch_shift():
                return pyrb.pitch_shift(audio, rate, pitch)

            cache_audio = AudioCaching.get(audio_file, key_pitch)
            got_pitch = cache_audio is not None
            cache_audio = cache_audio if got_pitch else AudioCaching.get(audio_file, key_stretch)
            got_stretch = cache_audio is not None
            audio = cache_audio if cache_audio is not None else audio
            if not got_stretch:
                audio = pyrb.time_stretch(audio, rate, stretch)
                AudioCaching.set(audio_file, key_stretch, audio)
            if not got_pitch:
                audio = pyrb.pitch_shift(audio, rate, pitch)
                AudioCaching.set(audio_file, key_pitch, audio)
            stretch = len(audio) / audio_base_len

        fbank_feat = logfbank(
            audio,
            rate,
            winlen=WIN_SIZE,
            winstep=WIN_STEP,
            nfilt=INPUT_SIZE,
        )  # TODO: remove scaling
        mfcc_feat = mfcc(
            audio,
            rate,
            winlen=WIN_SIZE,
            winstep=WIN_STEP,
            nfilt=32,
            numcep=16,
        )  # TODO: remove scaling

        # some audio instances are too short for the audio transcription
        # and the winlen cut :(
        fbank_feat = np.vstack([fbank_feat] + [fbank_feat[-1]] * 10)
        mfcc_feat = np.vstack([mfcc_feat] + [mfcc_feat[-1]] * 10)

        step_size = WIN_STEP * 1000
        with open(label_file) as f:
            lines = list(f.readlines())
        length = fbank_feat.shape[0]
        length_ms = length * step_size
        labels = []
        ms_samples = 16
        for line in lines:
            _, end, tag = line.split()
            end_ms = float(end) / ms_samples * stretch
            end_ms = min(end_ms, length_ms)
            labels.append((end_ms, tag))
        length = int(end_ms / step_size)

        (tag_ints, tag_vecs, tag_mapping,
         transcription, transcription_ints) = self.process_audio(
            labels,
            length,
            step_size,
        )
        fbank_feat = fbank_feat[: len(tag_ints)]
        mfcc_feat = mfcc_feat[: len(tag_ints)]
        length = fbank_feat.shape[0]
        length_ms = length * step_size

        w = [200.0 / FOUND_LABELS[KNOWN_LABELS[_pid]] for _pid, _ms in tag_mapping]
        if i % 150 == 0:
            print(i)
            gc.collect()

        if length == len(tag_vecs) and length == len(tag_ints):
            original = stack([tag_vecs], torch.FloatTensor)[0].cpu().numpy()
            original_ids = np.argmax(original, axis=1)
            if MERGE_DOUBLES:
                a, b, diff = find_borders(original_ids, tag_mapping)
                d = abs(diff).max()
                if d > 15:
                    print(
                        f"[DIFF-ERROR] diff is bigger {d} > 15",
                        np.where(abs(diff) > 15),
                        diff.shape,
                    )
                    print("\t", tag_mapping[-1], length_ms)
                    print("\t", np.round(a[-5:], 0))
                    print("\t", np.round(b[-5:], 0))
                    continue
            self.counts.append(length)
            tag_duration = []
            start = 0
            for _, end_ms in tag_mapping:
                end_time = end_ms / DURATION_SCALER
                tag_duration.append(end_time - start)
                start = end_time
            # CUMSUM vs DURATION
            pos, bor = pos_prep(torch.FloatTensor(tag_duration[:-1]).to(device))
            position.append(pos)
            border.append(bor)
            weight.append(w)
            out_dur.append(tag_duration)
            inp.append(fbank_feat)
            inp_mfcc.append(mfcc_feat)
            out_vec.append(tag_vecs)
            out_int.append(tag_ints)
            out_trans.append(transcription)
            out_trans_ints.append(transcription_ints)
            out_map.append(tag_mapping)
            self.files.append((label_file, audio_file))
        else:
            print(
                f"[ERROR] len not match {length} != {len(tag_vecs)} != {len(tag_ints)} \n\t - {label_file}\n\t - {audio_file}",
            )

    self.inp = stack(inp, torch.FloatTensor)
    self.inp_mfcc = stack(inp_mfcc, torch.FloatTensor)
    self.out_vec = stack(out_vec, torch.FloatTensor)
    self.out_int = stack(out_int, torch.LongTensor)
    self.transcription = stack(out_trans, torch.FloatTensor)
    self.transcription_int = stack(out_trans_ints, torch.LongTensor)
    self.out_map = out_map
    self.out_duration = stack(out_dur, torch.FloatTensor)
    self.in_transcription = stack(out_trans, torch.FloatTensor)
    self.key = [uuid.uuid4().urn for i in range(len(inp))]
    self.position = position
    self.border = border
    self.weight = stack(weight, torch.FloatTensor)

    FEATURES = RawField(postprocessing=self.features_batch_process)
    FEATURES_MFCC = RawField(postprocessing=self.features_batch_process)
    LABEL = RawField(postprocessing=self.features_batch_process)
    TRANSCRIPTION_INT = RawField(postprocessing=self.features_batch_process)
    TRANSCRIPTION = RawField()
    LABEL_VEC = RawField()
    OUT_MAP = RawField()
    OUT_DUR = RawField(postprocessing=self.features_batch_process)
    IN_TRANS = RawField(postprocessing=self.features_batch_process)
    INDEX = RawField()
    KEY = RawField()
    POSITION = RawField(postprocessing=self.features_batch_process)
    BORDER = RawField(postprocessing=self.features_batch_process)
    WEIGHT = RawField(postprocessing=self.features_batch_process)

    setattr(FEATURES, "is_target", False)
    setattr(FEATURES_MFCC, "is_target", False)
    setattr(LABEL_VEC, "is_target", False)
    setattr(OUT_MAP, "is_target", False)
    setattr(TRANSCRIPTION, "is_target", False)
    setattr(TRANSCRIPTION_INT, "is_target", False)
    setattr(OUT_DUR, "is_target", False)
    setattr(IN_TRANS, "is_target", False)
    setattr(LABEL, "is_target", True)
    setattr(INDEX, "is_target", False)
    setattr(KEY, "is_target", False)
    setattr(POSITION, "is_target", False)
    setattr(BORDER, "is_target", False)
    setattr(WEIGHT, "is_target", False)

    self.fields = {
        "features": FEATURES,
        "features_mfcc": FEATURES_MFCC,
        "labels": LABEL,
        "transcription": TRANSCRIPTION,
        "transcription_int": TRANSCRIPTION_INT,
        "label_vec": LABEL_VEC,
        "out_map": OUT_MAP,
        "out_duration": OUT_DUR,
        "in_transcription": IN_TRANS,
        "index": INDEX,
        "key": KEY,
        "position": POSITION,
        "border": BORDER,
        "weight": WEIGHT,
    }
def audio_pitch_shift():
    return pyrb.pitch_shift(audio, rate, pitch)
def shift_audio(self, semitones, data, sr):
    new_data = pyrb.pitch_shift(data, sr, semitones)
    return new_data
for meta in tqdm(
        open('ProsodyLabeling/000001-010000.txt', mode='r', encoding='utf8').readlines()):
    line = meta.strip().split('\t')
    if len(line) == 1:
        continue
    file_name = line[0]
    file_text = line[1]
    audio_path = os.path.join('Wave', file_name + '.wav')
    shutil.copyfile(os.path.join('Wave', file_name + '.wav'),
                    os.path.join('sample', file_name + '.wav'))
    print(file_name + '\t' + file_text + '\n', file=meta_new)
    for j, i in enumerate(range(len(pitch_list) - 1)):
        pitch_num = random.uniform(pitch_list[i], pitch_list[i + 1])
        # print(pitch_num)
        new_file_name = file_name + '-' + str(j) + '.wav'
        y, sr = librosa.load(audio_path, sr=None)
        y_shifted = pyrubberband.pitch_shift(y, sr, pitch_num)
        # print(os.path.join('sample', new_file_name))
        if y.shape != y_shifted.shape:
            print('shape mismatch after pitch shift')
        sf.write(os.path.join('sample', new_file_name), y_shifted, sr, format='wav')
        print(file_name + '-' + str(j) + '\t' + file_text + '\n', file=meta_new)
    # exit()
def audio(mudabox, state):
    """Deform the audio"""
    mudabox._audio["y"] = pyrb.pitch_shift(mudabox._audio["y"],
                                           mudabox._audio["sr"],
                                           state["n_semitones"])
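# Both deformer callbacks above follow muda's pitch-shift pattern. A hedged
# usage sketch, assuming muda is installed and a paired JAMS/audio file
# exists (the filenames are illustrative):
import muda

jam = muda.load_jam_audio('example.jams', 'example.ogg')
shifter = muda.deformers.PitchShift(n_semitones=[-2, -1, 1, 2])
for k, out_jam in enumerate(shifter.transform(jam)):
    # Each output jam carries one pitch-shifted copy of the audio.
    muda.save('shifted_{}.ogg'.format(k), 'shifted_{}.jams'.format(k), out_jam)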
def generate_labels_features_voca(self, all_list):
    pid = os.getpid()
    mp3_config, feature_config, mp3_str, feature_str = self.config_to_folder()

    i = 0      # number of songs
    j = 0      # number of impossible songs
    k = 0      # number of tried songs
    total = 0  # number of generated instances

    stretch_factors = [1.0]
    shift_factors = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6]

    loop_broken = False
    for song_name, lab_path, mp3_path, save_path in all_list:
        save_path = save_path + '_voca'

        # different song initialization
        if loop_broken:
            loop_broken = False

        i += 1
        print(pid, "generating features from ...", os.path.join(mp3_path))
        if i % 10 == 0:
            print(i, ' th song')

        original_wav, sr = librosa.load(os.path.join(mp3_path), sr=mp3_config['song_hz'])

        # save_path, mp3_string, feature_string, song_name, aug.pt
        result_path = os.path.join(save_path, mp3_str, feature_str, song_name.strip())
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        # calculate result
        for stretch_factor in stretch_factors:
            if loop_broken:
                loop_broken = False
                break

            for shift_factor in shift_factors:
                idx = 0  # for filename

                try:
                    chord_info = self.Chord_class.get_converted_chord_voca(os.path.join(lab_path))
                except Exception as e:
                    print(e)
                    print(pid, " chord lab file error : %s" % song_name)
                    loop_broken = True
                    j += 1
                    break

                k += 1
                # stretch the original sound and chord info
                x = pyrb.time_stretch(original_wav, sr, stretch_factor)
                x = pyrb.pitch_shift(x, sr, shift_factor)
                audio_length = x.shape[0]
                chord_info['start'] = chord_info['start'] / stretch_factor
                chord_info['end'] = chord_info['end'] / stretch_factor

                last_sec = chord_info.iloc[-1]['end']
                last_sec_hz = int(last_sec * mp3_config['song_hz'])

                if audio_length + mp3_config['skip_interval'] < last_sec_hz:
                    print('loaded song is too short :', song_name)
                    loop_broken = True
                    j += 1
                    break
                elif audio_length > last_sec_hz:
                    x = x[:last_sec_hz]

                origin_length = last_sec_hz
                origin_length_in_sec = origin_length / mp3_config['song_hz']

                current_start_second = 0
                # get the chord list between current_start_second and current + song_length
                while current_start_second + mp3_config['inst_len'] < origin_length_in_sec:
                    inst_start_sec = current_start_second
                    curSec = current_start_second
                    chord_list = []
                    # extract chord per 1/self.time_interval
                    while curSec < inst_start_sec + mp3_config['inst_len']:
                        try:
                            available_chords = chord_info.loc[
                                (chord_info['start'] <= curSec) &
                                (chord_info['end'] > curSec + self.time_interval)].copy()
                            if len(available_chords) == 0:
                                available_chords = chord_info.loc[
                                    ((chord_info['start'] >= curSec) &
                                     (chord_info['start'] <= curSec + self.time_interval)) |
                                    ((chord_info['end'] >= curSec) &
                                     (chord_info['end'] <= curSec + self.time_interval))].copy()

                            if len(available_chords) == 1:
                                chord = available_chords['chord_id'].iloc[0]
                            elif len(available_chords) > 1:
                                max_starts = available_chords.apply(
                                    lambda row: max(row['start'], curSec), axis=1)
                                available_chords['max_start'] = max_starts
                                min_ends = available_chords.apply(
                                    lambda row: min(row.end, curSec + self.time_interval), axis=1)
                                available_chords['min_end'] = min_ends
                                chords_lengths = available_chords['min_end'] - available_chords['max_start']
                                available_chords['chord_length'] = chords_lengths
                                chord = available_chords.loc[
                                    available_chords['chord_length'].idxmax()]['chord_id']
                            else:
                                chord = 169
                        except Exception as e:
                            chord = 169
                            print(e)
                            print(pid, "no chord")
                            raise RuntimeError()
                        finally:
                            # convert the chord id by the shift factor
                            if chord != 169 and chord != 168:
                                chord += shift_factor * 14
                                chord = chord % 168
                            chord_list.append(chord)
                            curSec += self.time_interval

                    if len(chord_list) == self.no_of_chord_datapoints_per_sequence:
                        try:
                            sequence_start_time = current_start_second
                            sequence_end_time = current_start_second + mp3_config['inst_len']
                            start_index = int(sequence_start_time * mp3_config['song_hz'])
                            end_index = int(sequence_end_time * mp3_config['song_hz'])
                            song_seq = x[start_index:end_index]

                            etc = '%.1f_%.1f' % (current_start_second,
                                                 current_start_second + mp3_config['inst_len'])
                            aug = '%.2f_%i' % (stretch_factor, shift_factor)

                            if self.feature_name == FeatureTypes.cqt:
                                feature = librosa.cqt(
                                    song_seq,
                                    sr=sr,
                                    n_bins=feature_config['n_bins'],
                                    bins_per_octave=feature_config['bins_per_octave'],
                                    hop_length=feature_config['hop_length'])
                            else:
                                raise NotImplementedError

                            if feature.shape[1] > self.no_of_chord_datapoints_per_sequence:
                                feature = feature[:, :self.no_of_chord_datapoints_per_sequence]
                            if feature.shape[1] != self.no_of_chord_datapoints_per_sequence:
                                print('loaded features length is too short :', song_name)
                                loop_broken = True
                                j += 1
                                break

                            result = {
                                'feature': feature,
                                'chord': chord_list,
                                'etc': etc
                            }
                            # save_path, mp3_string, feature_string, song_name, aug.pt
                            filename = aug + "_" + str(idx) + ".pt"
                            torch.save(result, os.path.join(result_path, filename))
                            idx += 1
                            total += 1
                        except Exception as e:
                            print(e)
                            print(pid, "feature error")
                            raise RuntimeError()
                    else:
                        print("invalid number of chord datapoints in sequence :", len(chord_list))
                    current_start_second += mp3_config['skip_interval']

    print(pid, "total instances: %d" % total)
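# A worked example of the shift-factor chord-id arithmetic above, assuming
# the 168 chord ids are laid out as 12 roots x 14 qualities in root-major
# order, with 168/169 reserved for no-chord/unknown:
chord = 3         # some chord quality on the first root
shift_factor = 2  # shift up two semitones
shifted = (chord + shift_factor * 14) % 168
assert shifted % 14 == chord % 14  # the quality survives; only the root moves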
def shift(audio, amount):
    return pyrb.pitch_shift(audio, 16000, amount)
def write_audio_file(path, name, voice, audio, sampling_rate):
    file_name = path + \
        time.strftime("%Y%m%d-%H%M%S_") + name + str(randint(0, 100)) + ".wav"
    if voice == "satan:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)
        fixed_framerate = 11000
        sound = AudioSegment.from_file(temp_file_name)
        sound = sound.set_frame_rate(fixed_framerate)
        write(file_name, fixed_framerate, audio)
        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 1.6)
        y_shift = pyrb.pitch_shift(y, sr, 1.6)  # computed but not written below
        sf.write(file_name, y_stretch, sr, format='wav')
        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    elif voice == "vader:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)
        AudioEffect.robotic(temp_file_name, file_name)
        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 0.9)
        y_shift = pyrb.pitch_shift(y, sr, 0.9)  # computed but not written below
        sf.write(file_name, y_stretch, sr, format='wav')
        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    else:
        write(file_name, sampling_rate, audio)
    return file_name