def edit_sounds(path, pitch, length):
    """Pitch-shift and time-stretch an audio file, writing the result as a new WAV.

    The file at ``path`` is loaded at 44.1 kHz, shifted by ``pitch`` steps and
    then stretched by the rate ``HIGHLIGHT_LENGTH / length``.

    Args:
        path: Location of the audio file to load.
        pitch: Shift amount handed to ``pitch_shift``.
        length: Divisor of ``HIGHLIGHT_LENGTH`` used to derive the stretch rate;
            a value of 0 raises ``ZeroDivisionError``.

    Returns:
        The path of the freshly written file, as produced by
        ``get_random_name('wav')``.
    """
    target_sr = 44100
    output_path = get_random_name('wav')
    speed = HIGHLIGHT_LENGTH / length

    samples, sr = librosa.load(path, sr=target_sr)
    shifted = pitch_shift(samples, sr, pitch)
    stretched = time_stretch(shifted, speed)

    write_wav(output_path, stretched, sr)
    return output_path
def SavespecArg(y_mix, y_inst, fname, shift, stretch):
    """Save spectrograms of a mix/instrumental pair plus augmented variants.

    The unmodified pair is saved under ``fname``. For every value in ``shift``
    the pair is pitch-shifted up and down by that amount, and for every value
    in ``stretch`` the pair is time-stretched by that rate; each variant is
    saved through ``Savespec`` under a derived name.

    Args:
        y_mix: Mixture signal.
        y_inst: Instrumental signal, processed identically to ``y_mix``.
        fname: Base name for the saved spectrograms.
        shift: Iterable of shift amounts (applied as ``+sh`` and ``-sh``).
        stretch: Iterable of stretch rates; the name suffix is ``int(st * 10)``.
    """
    Savespec(y_mix, y_inst, fname)

    for sh in shift:
        # Upward variant first, then the mirrored downward one.
        for signed_step, name_pattern in ((sh, "%s_shift%d"), (-sh, "%s_shift-%d")):
            mix_variant = pitch_shift(y_mix, C.SR, signed_step)
            inst_variant = pitch_shift(y_inst, C.SR, signed_step)
            Savespec(mix_variant, inst_variant, name_pattern % (fname, sh))

    for st in stretch:
        mix_variant = time_stretch(y_mix, st)
        inst_variant = time_stretch(y_inst, st)
        Savespec(mix_variant, inst_variant, "%s_stretch%d" % (fname, int(st * 10)))
def autotune(self, audio):
    """Pitch-correct every sung note towards the pitch written in the score.

    For each (singer, section) pair the corresponding row of ``audio`` is
    written to a temporary WAV file, the external REAPER binary is run on it to
    obtain a voicing/f0 track, and every non-silent note is pitch-shifted by
    the semitone difference between the mean detected f0 of its voiced frames
    and ``note['pitch']``.

    Args:
        audio: Indexable as ``audio[i]`` and ``audio[i, start:stop]``; row
            ``part_idx * self.section_size + section_idx`` holds one section.
            It is modified in place.

    Returns:
        The same ``audio`` object, after in-place correction.

    Raises:
        subprocess.CalledProcessError: If the REAPER executable fails.
        IndexError: If REAPER reports fewer than two frames.
    """
    reaper_path = os.path.join('.', 'REAPER', 'build', 'reaper')

    for part_idx, (singer, part) in enumerate(self.song.items()):
        for section_idx in range(self.section_size):
            i = part_idx * self.section_size + section_idx
            print('Autotuning %s #%d...' % (singer, section_idx))

            # Write that audio to disk and pitch detect.
            audio_path = os.path.join(
                self.save_path, 'tmp',
                'raw' + self.song_name + '_' + singer + '.wav')
            pitch_path = os.path.join(
                self.save_path, 'tmp',
                'raw' + self.song_name + '_' + singer + '.f0')
            soundfile.write(audio_path, audio[i], self.FS)
            subprocess.check_output(
                [reaper_path, '-i', audio_path, '-f', pitch_path, '-a'])

            with open(pitch_path, 'r') as f0_file:
                reaper_lines = f0_file.read().split('\n')[7:-1]  # skip header information

            times, voice, f0 = [], [], []
            for line in reaper_lines:
                t, v, f = line.split(' ')
                times.append(float(t))
                voice.append(int(v))
                f0.append(float(f))

            # Spacing between consecutive analysis frames, in seconds.
            frame_period = times[1] - times[0]

            time0 = 0  # start of the current note, in seconds
            for note in part:
                time1 = time0 + note['duration']
                if note['volume'] != 0:
                    # Compute the pitch over that interval.
                    start = int(time0 / frame_period)
                    stop = int(time1 / frame_period)
                    # Pair each f0 value with the voicing flag of the SAME
                    # frame; indexing ``voice`` from 0 would read the flags of
                    # the start of the recording instead of this note.
                    voiced_f0 = [
                        freq
                        for flag, freq in zip(voice[start:stop], f0[start:stop])
                        if flag == 1
                    ]
                    # Without any voiced frame there is no pitch estimate
                    # (the mean would be NaN), so leave the note untouched.
                    if voiced_f0:
                        actual_pitch = np.asarray(voiced_f0).mean()
                        desired_pitch = note['pitch']
                        semitone_shift = self.semitone_diff(
                            actual_pitch, desired_pitch)

                        # Perform autotuning on the note's sample range.
                        start = int(time0 * self.FS)
                        stop = int(time1 * self.FS)
                        audio[i, start:stop] = pitch_shift(
                            audio[i, start:stop], self.FS, semitone_shift)
                time0 = time1
    return audio
def pitch_shifting(self, sample):
    """Randomly pitch-shift ``sample`` or return it unchanged.

    One entry is drawn from ``[-2, -1, 1, 2, None]``. ``None`` means "no
    augmentation" and returns ``sample`` as is; any other entry is passed as
    ``n_steps`` to ``pitch_shift`` with a fixed sample rate of 16000, and the
    result is cast to ``np.int16``.

    NOTE(review): the index is drawn with ``randint(0, len - 1)``; this reaches
    the ``None`` entry only if ``randint`` has an inclusive upper bound
    (as ``random.randint`` does) - confirm which ``randint`` is imported.
    """
    candidates = [-2, -1, 1, 2, None]
    chosen = candidates[randint(0, len(candidates) - 1)]

    if chosen is None:
        return sample

    shifted = pitch_shift(sample.astype(float), 16000, n_steps=chosen)
    return np.int16(shifted)
def get_data(train=True):
    """Assemble one batch of randomly cropped (and optionally augmented) sources.

    ``self``, ``config``, the wav lists and the augmentation tables are taken
    from the enclosing scope. For every batch element a random recording and a
    random crop of ``2 * clip_sec`` seconds are drawn; with ``train`` set, the
    pitch, tempo and amplitude augmentations enabled in ``config`` are applied.

    Args:
        train: Draw from ``train_data_wav`` and allow augmentation when true,
            otherwise draw from ``test_data_wav`` without augmentation.

    Returns:
        The batch as an array transposed with ``(1, 0, 2, 3, 4)``, i.e. with
        the per-source axis first and the batch axis second.
    """
    window = int(self.clip_sec * self.samplerate) * 2
    collected = []

    for _ in range(self.batch_size):
        recordings = train_data_wav if train else test_data_wav
        rec_idx = np.random.randint(len(recordings))
        crop_idx = np.random.randint(len(recordings[rec_idx][0]) - window)
        sources = [track[crop_idx:crop_idx + window] for track in recordings[rec_idx]]

        # Pitch augmentation happens in the time domain, before the STFT.
        if config.pitch_aug and train:
            n_steps = pitch_shift_list[np.random.randint(len(pitch_shift_list))]
            if n_steps != 0:
                sources = [
                    pitch_shift(track, self.samplerate, n_steps=n_steps)
                    for track in sources
                ]

        sources = [from_polar(to_stft(track, self.nfft)) for track in sources]

        # Tempo augmentation works on the complex spectrogram rebuilt from the
        # two trailing channels, then splits it back into those channels.
        if config.bpm_aug and train:
            rate = stretch_rate_list[np.random.randint(len(stretch_rate_list))]
            if rate != 1.0:
                for pos, spec in enumerate(sources):
                    stretched = phase_vocoder(
                        spec[:, :, 0] + 1j * spec[:, :, 1], rate=rate)
                    sources[pos] = np.array(
                        [np.real(stretched), np.imag(stretched)]
                    ).transpose(1, 2, 0)

        # Each source receives its own random gain in [0.75, 1.25).
        if config.amp_aug and train:
            sources = [spec * (0.75 + (np.random.random() * 0.5)) for spec in sources]

        collected.append(random_crop(sources, self.ydim))

    batch_out = np.array(collected).transpose(1, 0, 2, 3, 4)

    # Optionally shuffle every source independently along the batch axis.
    if train and true_wp(config.shuffle_sources_aug_prob) == 1.0:
        for source_i in range(self.num_sources):
            np.random.shuffle(batch_out[source_i])
    return batch_out
def change_pitch(self):
    """Replace ``self.X`` with a randomly pitch-shifted copy of itself.

    The shift is drawn uniformly from ``[-4, 4)`` and passed as ``n_steps``
    together with ``bins_per_octave=24`` and the sample rate ``self.sr``.
    """
    octave_bins = 24
    max_offset = 4
    # uniform() lies in [0, 1), so this spans [-max_offset, max_offset).
    offset = max_offset * 2 * (np.random.uniform() - 0.5)
    self.X = pitch_shift(
        self.X,
        self.sr,
        n_steps=offset,
        bins_per_octave=octave_bins,
    )
def shift_pitch(wave: np.array, sr: int, pitch_step: int = 5) -> np.array:
    """
    @topic: Change the pitch of an audio wave by a given number of steps.
    @input: wave: audio wave, sr: sampling rate, pitch_step: the step of pitch shift.
    @return: the pitch shifted audio wave.
    """
    return pitch_shift(wave, sr, pitch_step)
def generate_pitch_shifting_augmentation(target_directory, input_directory):
    """Write pitch-shifted copies of every ``.wav`` file under a directory tree.

    For each ``.wav`` file found while walking ``input_directory``, two files
    are written into ``target_directory``: ``lower<name>`` (shifted by -2
    steps) and ``higher<name>`` (shifted by +2 steps), both at the file's
    native sample rate.

    Args:
        target_directory: Output directory; created (including parents) if it
            does not exist yet.
        input_directory: Root of the tree that is searched for ``.wav`` files.

    Note:
        All outputs land flat in ``target_directory``, so input files that
        share a name in different sub-directories overwrite each other.
    """
    audio_file_extension = ".wav"
    os.makedirs(target_directory, exist_ok=True)

    for root, _dirs, files in os.walk(input_directory):
        for file in files:
            if not file.endswith(audio_file_extension):
                continue
            audio_path = os.path.join(root, file)
            signal, sample_rate = librosa.load(audio_path, sr=None)
            # Both variants carry the source file name; previously the
            # "higher" variant was always written to one shared path.
            for prefix, steps in (('lower', -2), ('higher', 2)):
                librosa.output.write_wav(
                    os.path.join(target_directory, prefix + file),
                    pitch_shift(signal, sample_rate, steps),
                    sample_rate)
def LoadAudio_Arg(fname, pitch_shift, time_stretch):
    """Load an audio file, augment it, and return its normalised spectrogram.

    Args:
        fname: Path of the audio file.
        pitch_shift: Pitch-shift amount applied to the signal.
        time_stretch: Time-stretch rate applied after the pitch shift.

    Returns:
        ``(mag, phase)``: the STFT magnitude divided by its maximum, and the
        unit-modulus complex phase ``exp(1j * angle(spec))``.
    """
    # The parameter names shadow the effect functions of the same name, so the
    # functions are imported locally under different names; calling
    # ``pitch_shift(...)`` here would try to call the numeric argument.
    from librosa.effects import pitch_shift as _apply_pitch_shift
    from librosa.effects import time_stretch as _apply_time_stretch

    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)

    y = _apply_pitch_shift(y, C.SR, pitch_shift)
    y = _apply_time_stretch(y, time_stretch)

    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase
def pitch_shift_wavfile(wav, sr, n_octaves):
    """Shift ``wav`` by whole octaves while keeping its peak amplitude.

    Args:
        wav: Audio samples as a NumPy array.
        sr: Sample rate of ``wav``.
        n_octaves: Number of octaves to shift by; 0 returns ``wav`` untouched.

    Returns:
        ``wav`` itself when ``n_octaves`` is 0; otherwise a float array that is
        rescaled so its absolute peak equals that of the input. If the shifted
        signal is entirely silent it is returned without rescaling.
    """
    if n_octaves == 0:
        return wav

    samples = wav.astype(float)
    # Take the peak on the float copy: abs() of the most negative value of a
    # fixed-width integer type overflows.
    peak = np.max(np.abs(samples))

    new_wav = pitch_shift(y=samples, sr=sr, n_steps=n_octaves * 12,
                          bins_per_octave=12)
    new_peak = np.max(np.abs(new_wav))
    if new_peak == 0:
        # Nothing to rescale; dividing would only produce NaNs.
        return new_wav
    return peak * new_wav / new_peak
def create_fetures(audio_path, aug_index):
    """Load an audio file, apply one augmentation, and build its mel features.

    Args:
        audio_path: File to load with ``torchaudio``.
        aug_index: Selects the augmentation: 1 adds Gaussian noise scaled by
            0.001, 2 applies ``torchaudio.functional.contrast``, 3 pitch-shifts
            each channel by a random amount in ``[0, 1)``, 4 applies ``drc``
            with ``bitdepth=6``, 5 convolves with a randomly chosen stored
            filter response (only when ``'-a'`` occurs in ``audio_path``).
            Any other value leaves the waveform unchanged.

    Returns:
        The output of ``create_mels_deltas``; for two-channel input the
        per-channel results are concatenated along dimension 0.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    if aug_index == 1:
        waveform = waveform + torch.randn(waveform.shape) * 0.001
    elif aug_index == 2:
        waveform = torchaudio.functional.contrast(waveform)
    elif aug_index == 3:
        samples = waveform.squeeze(0).numpy()
        if waveform.shape[0] == 2:
            # Each channel draws its own random shift, channel 0 first.
            per_channel = [
                torch.tensor(
                    pitch_shift(samples[ch, :], sample_rate,
                                np.random.random(1)[0])).unsqueeze(0)
                for ch in (0, 1)
            ]
            waveform = torch.cat(per_channel, dim=0)
        else:
            shifted = pitch_shift(samples, sample_rate, np.random.random(1)[0])
            waveform = torch.tensor(shifted).unsqueeze(0)
    elif aug_index == 4:
        waveform = drc(waveform, bitdepth=6)
    elif aug_index == 5 and '-a' in audio_path:
        response_file = FILTER_RESPONSES.replace('s.', f's{random.randint(1, 6)}.')
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # safe while the filter-response files are trusted.
        with open(response_file, 'rb') as handle:
            h = pickle.load(handle)
        filtered = np.convolve(waveform.squeeze(0).numpy(), h, 'same')
        waveform = torch.tensor(filtered).unsqueeze(0)

    if waveform.shape[0] == 2:
        return torch.cat([
            create_mels_deltas(waveform[0], sample_rate),
            create_mels_deltas(waveform[1], sample_rate)
        ], dim=0)
    return create_mels_deltas(waveform, sample_rate)
def pitch_shift(self, n_steps, **kwargs):
    """
    Return a copy of this object whose audio data is pitch-shifted.

    The receiver is left untouched; the copy's ``data`` is replaced by the
    result of librosa_'s ``pitch_shift`` called with the copy's ``data``,
    its ``fs`` and ``n_steps``. Extra keyword arguments are forwarded.

    .. _librosa: https://librosa.org
    """
    shifted = self.copy()
    shifted.data = pitch_shift(shifted.data, shifted.fs, n_steps, **kwargs)
    return shifted
def apply(self, waveform, **params):
    """Pitch-shift a single-channel waveform by a random number of steps.

    Args:
        waveform: Tensor whose first dimension is the channel axis and must
            have size 1; the second dimension must be non-empty.
        **params: Accepted for interface compatibility and ignored.

    Returns:
        A new float tensor with the channel dimension restored.

    Raises:
        AssertionError: If the waveform is not 1-channel or is empty.
    """
    assert waveform.shape[0] == 1, 'waveform should have 1-channel'
    assert waveform.shape[1] > 0, 'waveform is empty'

    mono = waveform.clone()[0].numpy()
    # Negative values shift down, positive values shift up. numpy's randint
    # excludes the upper bound, so self.max_steps itself is never drawn.
    semitones = np.random.randint(self.min_steps, self.max_steps)
    shifted = pitch_shift(mono, self.sr, n_steps=semitones, bins_per_octave=12)
    return torch.tensor(shifted, dtype=torch.float).unsqueeze(0)
def pitch_shifting(self, sample):
    """
    Randomly modify the pitch of the audio, or leave it unchanged.

    One entry of ``[-2, -1, 1, 2, None]`` is drawn; ``None`` returns ``sample``
    as is, any other entry is used as ``n_steps`` and the shifted signal is
    cast to ``np.int16``.

    NOTE(review): ``SEQ_LENGTH`` is passed in the position where
    ``pitch_shift`` normally receives the sample rate - confirm this is
    intended.

    ref: https://arxiv.org/pdf/1608.04363.pdf
    """
    candidates = [-2, -1, 1, 2, None]
    chosen = candidates[randint(0, len(candidates) - 1)]

    if chosen is None:
        return sample

    shifted = pitch_shift(sample.astype(float), SEQ_LENGTH, n_steps=chosen)
    return np.int16(shifted)
def speed_up(video_clip, speed):
    """Change a clip's playback speed while restoring the original audio pitch.

    Args:
        video_clip: Clip exposing ``speedx`` and an ``audio`` attribute with
            ``to_soundarray``.
        speed: Playback-rate factor; values above 1 speed the clip up, values
            between 0 and 1 slow it down.

    Returns:
        The sped-up clip with a pitch-corrected audio track at 44.1 kHz.

    Raises:
        ValueError: If ``speed`` is not strictly positive.
    """
    import math  # local import: only this function needs it

    if speed <= 0:
        raise ValueError('speed must be a positive factor, got %r' % (speed,))

    rate = 44100

    # Speed up video (and with it the audio track).
    video_clip = video_clip.speedx(speed)

    # Playing audio `speed` times faster multiplies every frequency by
    # `speed`, i.e. raises the pitch by 12 * log2(speed) semitones; shift by
    # the opposite amount. A linear formula only agrees with this at
    # speed == 2 and speed == 0.5.
    shift = -12 * math.log2(speed)

    # Fix audio pitch channel by channel.
    audio = video_clip.audio.to_soundarray(fps=rate).transpose()
    for i, channel in enumerate(audio):
        audio[i] = pitch_shift(channel, rate, shift)
    audio = audio.transpose()

    video_clip.audio = AudioArrayClip(audio, fps=rate)
    return video_clip
def increasePitch(path, fileName, outputDir):
    """Raise the pitch of a video's audio track by one step and save it.

    The audio is loaded from the video file, shifted up by 1, written as a
    temporary ``.wav`` next to the target and finally exported as ``.mp3``.

    Args:
        path (string): The path to the current video file
        fileName (string): The filename (without extension) for the audio files
        outputDir (string): The output directory; it is concatenated directly
            with ``fileName``, so it must end with a path separator
    """
    # Import audio from video
    print('Extracting audio from: ' + path)
    audio, sr = librosa.load(path)

    # Increase its pitch
    increasedPitch = pitch_shift(audio, sr, 1)

    stem = outputDir + fileName
    wav_path = stem + '.wav'

    # Save the .wav
    print('Saving audio .wav to: ' + stem)
    librosa.output.write_wav(wav_path, increasedPitch, sr)

    # Convert the .wav to .mp3
    print('Converting to .mp3 from: ' + stem)
    mp3 = AudioSegment.from_wav(wav_path)

    # Save the .mp3
    print('Saving .mp3 to: ' + stem)
    mp3.export(stem + '.mp3', 'mp3')
# Interactive scratch session: synthesise tones, play them, record from the
# microphone and listen to pitch-shifted versions.
# `wf`, `p2`, `t`, `fs` and `dur` are defined earlier in the session.
wf += np.abs(0.2+np.cos(p2*4*t))*np.cos(p2*120*t)
sd.play(wf, fs, device=8)
swrite('swav.wav', fs, wf)

plt.figure()
plt.plot(t, wf)
plt.show(block=False)

# 220 Hz raised by one semitone.
sd.play(10*np.sin(2*3.14159*220*(2**(1/12))*t), fs, device=8)
# Frequency of 440 Hz raised by one semitone (bare expression: only displays
# a value in an interactive shell). The original line held this expression
# twice with no separator, which is a syntax error.
440*2**(1/12)
# 440 Hz raised by three semitones.
sd.play(100*np.sin(2*3.14159*440*(2**(3/12))*t), fs, device=8)

# Compare a synthesised tone with its half-step-shifted version.
wf = 10*np.sin(2*3.14159*220*(2**(1/12))*t)
wfps = pitch_shift(wf, fs, n_steps=0.5)
sd.play(wf, fs)
sd.play(wfps, fs)

# Record `dur` seconds from input device 1 and shift the recording down.
mr = sd.rec(fs*dur, fs, channels=1, dtype='float64', device=1)
# sd.rec returns immediately; wait until the buffer is filled before it is
# processed, otherwise the shift runs on a partially recorded signal.
sd.wait()
mr = mr.squeeze()
mrps = pitch_shift(mr, fs, n_steps=-5)
sd.play(10*mrps, fs)

# Inspection helpers (only display values in an interactive shell).
sd.query_devices()
mrps.shape
mrps.dtype
np.amin(mrps)
def pitch_shift(x, sr=16000, n_steps=15):
    """Pitch-shift ``x`` by a random whole number of steps in ``[-n_steps, n_steps]``.

    Args:
        x: Audio signal forwarded to ``effects.pitch_shift``.
        sr: Sample rate of ``x``.
        n_steps: Largest shift magnitude, inclusive on both sides.

    Returns:
        The result of ``effects.pitch_shift`` for the drawn shift.
    """
    # torch.randint excludes ``high``; the +1 makes the range symmetric and
    # keeps ``n_steps == 0`` valid (it would otherwise be an empty range).
    steps = torch.randint(low=-n_steps, high=n_steps + 1, size=[1]).item()
    return effects.pitch_shift(x, sr, steps)
# Evaluate every candidate file against the target at all 12 transpositions.
e_dissonance_dict = {}
audio_target = std.MonoLoader(filename=target)()
print("Target:", target)
print("======================")

for candidate in filelist:
    print("Candidate:", candidate)
    audio_candidate = std.MonoLoader(filename=candidate)()

    # One slot per transposition.
    harmonicities = np.zeros(12)
    inharmonicities = np.zeros(12)
    dissonances = np.zeros(12)

    for pshift in range(12):
        if pshift == 0:
            mod_candidate = audio_candidate
        else:
            # Shifts 1..5 go up; 6..11 are folded to -6..-1 so the candidate
            # never moves by more than six steps.
            steps = pshift - 12 if pshift > 5 else pshift
            mod_candidate = pitch_shift(audio_candidate, 44100, steps).astype(np.float32)

        mix_audio = mix(audio_target, mod_candidate, 44100)
        spf, mpf = utils.get_sines_per_frame(mix_audio, 44100)
        hpf, hpm = utils.get_hpeaks_per_frame(mix_audio, 44100)
        pcs = transform_to_pc(spf)

        n_frames = pcs.shape[0]
        har_frame = np.zeros(n_frames)
        inh_frame = np.zeros(n_frames)
        dis_frame = np.zeros(n_frames)

        for i in range(n_frames):
            # Peter's harmonicity part
            mspec = milne_pc_spectrum(pcs[i])
            har_frame[i] = ph_harmon(mspec)
from os import path

import numpy as np
import soundfile as sf
from audio_helpers import play_audio
from librosa.effects import pitch_shift
from librosa.core import load
from librosa.output import write_wav

# Input file and the derived name of the pitch-shifted copy.
DIR = 'data/wave'
fn = '2.wav'
base_name, ext = path.splitext(fn)
FN = path.join(DIR, fn)
FN_NEW = path.join(DIR, '{}_shifted{}'.format(base_name, ext))

x, fs = load(FN)
# Function-call form prints identically on Python 2 and also runs on Python 3.
print("Script loaded file with fs {}".format(fs))


def to_pcm(x):
    """Convert float samples in [-1, 1] to 16-bit PCM integers.

    Samples outside [-1, 1] are clipped first; without clipping they would
    wrap around when cast to ``int16``.
    """
    max_val = np.iinfo(np.int16).max
    return (np.clip(x, -1.0, 1.0) * max_val).astype(np.int16)


# Shift up by two steps, store as 24-bit PCM and play the result.
shifted = pitch_shift(x, fs, 2)
sf.write(FN_NEW, shifted, fs, subtype="PCM_24")
play_audio(FN_NEW)
starting_times = (9, 25, 40, 52, 65, 77, 89, 121, 137, 157, 170, 184)  # times in seconds when each note starts in the input video
length = 8  # length of the generated clips
octaves = (0, 9)  # range of octaves
resolution = (600, 800)  # video resolution of the outputs
notes = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']  # name of the notes

# this list contains the clips for each note in the input video
src_clips = [
    VideoFileClip(input_file).subclip(start, start + length).volumex(0.6)
    for start in starting_times
]

# Generate one video per note over the whole octave range.
for n in range(octaves[0] * 12, octaves[1] * 12):
    note_name = f"{n // 12}{notes[(n % 12)]}"
    clip = src_clips[(n - 3) % 12].copy()  # the '-3' is because notes starts with 'C' but during the generation they starts from 'A'
    trans = (((n - 3) // 12) - 5) * 12  # transposition offset

    # Render the source audio once per note (channels first) instead of once
    # per channel; rendering is the expensive part and gives the same samples.
    channels = np.transpose(clip.audio.to_soundarray())
    clip.audio = AudioArrayClip(
        np.transpose(
            np.stack([
                pitch_shift(  # this function does the trick
                    channels[channel],
                    48000,  # it's the sampling of the source video
                    n_steps=trans)
                for channel in (0, 1)  # (0, 1) because the source is stereo
            ])),
        fps=48000)
    clip.write_videofile(f"{note_name}.mp4")
def create_dataset_for_one_song(
    song_name,
    wav_path,
    y,
    sr,
    k,
    idx,
    npy_dataset_name,
    crop_size,
    beat_dir_path,
):
    """Generate and save the cropped feature files for one song.

    For each pitch shift in ``[-12, -6, 0, 6, 12]`` and each of five stretch
    variants (index 0 is the unstretched one), the mel spectrogram, beat
    activation, bpm curve and beat/bar phase features are computed, cut into
    crops of ``crop_size`` frames and written with ``np.save`` into the
    directories returned by ``gen_path``.

    Args:
        song_name: Used in the tqdm description, the beat file name and the
            output file names.
        wav_path: Audio file used for the unstretched mel spectrogram.
        y: Audio samples that are pitch-shifted and time-stretched.
        sr: Sample rate of ``y``.
        k: Passed through to ``gen_path``.
        idx: Song index; together with the file name it decides between the
            "valid" and "train" destinations.
        npy_dataset_name: Passed through to ``gen_path``.
        crop_size: Crop length (and hop) in frames.
        beat_dir_path: Prefix of the ``<song_name>.BEAT.TXT`` annotation file.

    Raises:
        AssertionError: If the maximum of the bar or beat phase feature is not
            above 0.5.
    """
    shifts = [-12, -6, 0, 6, 12]
    for shift in tqdm(shifts, desc=song_name):
        shifted_y = y if shift == 0 else pitch_shift(y, sr, n_steps=shift)
        save_name1 = song_name if shift == 0 else f"{song_name}shift{shift}"
        for stretch_i in range(5):  # stretch 5 times randomly
            if stretch_i == 0:
                # Unstretched variant: also (re)loads the annotations that the
                # stretched variants of this shift reuse below.
                save_name2 = f"{save_name1}original" if shift == 0 else save_name1
                beat_path = f"{beat_dir_path}{song_name}.BEAT.TXT"
                # NOTE(review): this reads the file at wav_path, not
                # shifted_y, so for shift != 0 the saved "shift" melspec does
                # not look pitch-shifted - confirm this is intended.
                melspec = convertAudio2MelSpec(wav_path)  # 100Hz
                activation, downbeats = convertBeatText2Activation(
                    beat_path, song_length=len(melspec), units="ms"
                )
                bpm = convertBeatText2Bpm(beat_path, len(melspec))
                max_bpm = np.max(bpm)
            else:
                # Upper rate bound derived from the maximum bpm of the
                # unstretched song.
                max_rate = 300 / (max_bpm + 10)
                # Index 0 is a placeholder: stretch_i == 0 never reaches here.
                stretch_rates = [None, 0.5, 0.75, max_rate / 2, max_rate - 0.01]
                # stretch_rate = random.choice(np.arange(0.5, max_rate, 0.05))
                stretch_rate = stretch_rates[stretch_i]
                rounded_rate = np.round(stretch_rate, decimals=2)
                save_name2 = f"{save_name1}stretch{stretch_i}x{rounded_rate}"
                stretched_y = time_stretch(shifted_y, stretch_rate)
                melspec = convertAudio2MelSpec(None, True, stretched_y, sr)  # 100Hz
                stretched_activation = stretch_beat(activation, stretch_rate)
                stretched_bpm = stretch_bpm(bpm, stretch_rate)
                stretched_downbeats = (np.rint(downbeats / stretch_rate)).astype(
                    np.int64
                )
            beattheta = activation2beattheta(
                activation if stretch_i == 0 else stretched_activation
            )
            bartheta = activation2bartheta(
                activation if stretch_i == 0 else stretched_activation,
                downbeats if stretch_i == 0 else stretched_downbeats,
            )
            assert np.max(bartheta) > 0.5
            assert np.max(beattheta) > 0.5
            features = [
                ["melspec", melspec],
                ["activation", activation if stretch_i == 0 else stretched_activation],
                ["bpm", bpm if stretch_i == 0 else stretched_bpm],
                ["beattheta", beattheta],
                ["downbeattheta", bartheta],
            ]
            for feature in features:
                if feature[0] == "bpm":
                    # Cap the bpm curve at 300. This writes into the array
                    # itself, so for stretch_i == 0 the shared `bpm` array is
                    # modified for the later stretch variants as well.
                    feature[1][feature[1] > 300] = 300
                feature[1] = np.ascontiguousarray(feature[1])
                feature_length = len(feature[1])
                # Features shorter than one crop are kept whole; longer ones
                # are cut into non-overlapping frames of crop_size.
                cropped_features = (
                    [feature[1]]
                    if feature_length < crop_size
                    else librosa.util.frame(
                        x=feature[1],
                        frame_length=crop_size,
                        hop_length=crop_size,
                        axis=0,
                    )
                )
                for index, cropped_feature in enumerate(cropped_features):
                    # Negative values are clamped to zero in place.
                    cropped_feature[cropped_feature < 0] = 0
                    # Directory name of the feature; the file extension below
                    # keeps the original feature key.
                    fname = feature[0]
                    if fname == "activation":
                        fname = "beat"
                    elif fname == "melspec":
                        fname = f"melspec_sr{sr}_nfft1024"
                    if "original" in save_name2:
                        path = gen_path(k, "test", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    # NOTE(review): this is a separate `if`, not `elif`, so
                    # crops saved to "test" above are saved again to "valid"
                    # or "train" - confirm the overlap is intended.
                    if idx > 25 and (idx - 1) % 25 < 5 and "shift" not in save_name2:
                        path = gen_path(k, "valid", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    else:
                        path = gen_path(k, "train", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
def change_pitch(audio, rate, factor=1.0):
    """Return ``audio`` pitch-shifted by ``factor``.

    Args:
        audio: Signal to shift.
        rate: Sample rate of ``audio``.
        factor: Shift amount, passed to ``pitch_shift`` as its third
            positional argument.
    """
    shifted = pitch_shift(audio, rate, factor)
    return shifted
def __call__(self, sample):
    """Pitch-shift ``sample`` by a random whole number of steps.

    The shift is drawn uniformly from ``[-self.n_steps, self.n_steps]`` and
    applied with ``self.sr``, ``self.bins_per_octave`` and ``self.res_type``.

    Args:
        sample: Audio signal to augment.

    Returns:
        The pitch-shifted signal.
    """
    # numpy's randint excludes the upper bound; the +1 makes the range
    # symmetric and keeps ``self.n_steps == 0`` from raising on an empty range.
    n_steps = np.random.randint(-self.n_steps, self.n_steps + 1)
    return pitch_shift(sample, self.sr, n_steps, self.bins_per_octave,
                       self.res_type)
# Save the figure that was drawn before this point as the reference image.
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/original_mel.jpg')
plt.close()

# --- Dynamic-range compression demo -------------------------------------
waveform, sample_rate = torchaudio.load(audio_path)
bitdepth = 6
# Logarithmic compression with mu = 2**bitdepth - 1 ...
waveform_drc = np.sign(waveform) * np.log(1 + (2 ** bitdepth - 1) * np.abs(waveform)) / np.log(2 ** bitdepth)
# ... rounding to multiples of 1 / 2**(bitdepth - 1) ...
waveform_drc = np.round(waveform_drc * (2 ** (bitdepth - 1))) / (2 ** (bitdepth - 1))
# ... and the inverse of the compression applied to the rounded values.
waveform_drc = np.sign(waveform_drc) * ((2 ** bitdepth) ** np.abs(waveform_drc) - 1) / (2 ** bitdepth - 1)
wavio.write("/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/drc.wav", waveform_drc[0].numpy(), sample_rate, sampwidth=2)
one_mel = melspectrogram(waveform_drc.squeeze(0).numpy(), sr=sample_rate, n_fft=2048, hop_length=1024, n_mels=128, fmin=0.0, fmax=sample_rate / 2, htk=True, norm=None)
# Log-scale the mel spectrogram and min-max normalise it to [0, 1].
one_mel = np.log(one_mel + 1e-8)
one_mel = (one_mel - np.min(one_mel)) / (np.max(one_mel) - np.min(one_mel))
# Distance between the processed and the original waveform.
print(np.linalg.norm(abs(waveform_drc - waveform), 2))
plt.imshow(one_mel)
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/drc_mel.jpg')
plt.close()

# --- Pitch-shift demo (shift of 0.5) ------------------------------------
# assumes the file is mono so that squeeze(0) yields a 1-D signal - TODO confirm
waveform_pitch_shift = pitch_shift(waveform.squeeze(0).numpy(), sample_rate, 0.5)
wavio.write("/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/pitch_shift.wav", waveform_pitch_shift, sample_rate, sampwidth=2)
one_mel = melspectrogram(waveform_pitch_shift, sr=sample_rate, n_fft=2048, hop_length=1024, n_mels=128, fmin=0.0, fmax=sample_rate / 2, htk=True, norm=None)
# Same log scaling and min-max normalisation as above.
one_mel = np.log(one_mel + 1e-8)
one_mel = (one_mel - np.min(one_mel)) / (np.max(one_mel) - np.min(one_mel))
print(np.linalg.norm(abs(waveform_pitch_shift - waveform.squeeze(0).numpy()), 2))
plt.imshow(one_mel)
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/waveform_pitch_shift.jpg')
plt.close()