Example #1
def edit_sounds(path, pitch, length):
    sr = 44100
    output_path = get_random_name('wav')
    speed = HIGHLIGHT_LENGTH / length
    y, sr = librosa.load(path, sr=sr)
    write_wav(output_path, time_stretch(pitch_shift(y, sr, pitch), speed), sr)
    return output_path
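Example #1 depends on project pieces that are not shown: get_random_name, write_wav, and the constant HIGHLIGHT_LENGTH. A self-contained sketch of the same pipeline, with assumed stand-ins for those names (keyword arguments are used so it also runs on librosa >= 0.10):

# Sketch of Example #1's pipeline; HIGHLIGHT_LENGTH and the uuid-based
# output name are assumptions standing in for the project's own helpers.
import uuid

import librosa
import soundfile as sf
from librosa.effects import pitch_shift, time_stretch

HIGHLIGHT_LENGTH = 10.0  # assumed target duration in seconds


def edit_sounds_sketch(path, pitch, length, sr=44100):
    output_path = f"{uuid.uuid4().hex}.wav"  # stand-in for get_random_name('wav')
    speed = HIGHLIGHT_LENGTH / length        # stretch rate that hits the target length
    y, sr = librosa.load(path, sr=sr)
    y = pitch_shift(y, sr=sr, n_steps=pitch)
    y = time_stretch(y, rate=speed)
    sf.write(output_path, y, sr)             # stand-in for write_wav
    return output_path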
Example #2
def SavespecArg(y_mix, y_inst, fname, shift, stretch):
    Savespec(y_mix, y_inst, fname)
    for sh in shift:
        y_mix_shift = pitch_shift(y_mix, C.SR, sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift%d" % (fname, sh))

        y_mix_shift = pitch_shift(y_mix, C.SR, -sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, -sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift-%d" % (fname, sh))

    for st in stretch:
        y_mix_stretch = time_stretch(y_mix, st)
        y_inst_stretch = time_stretch(y_inst, st)
        Savespec(y_mix_stretch, y_inst_stretch,
                 "%s_stretch%d" % (fname, int(st * 10)))
Example #3
    def autotune(self, audio):
        """because the singers can't sing in tune for their livs"""
        for part_idx, (singer, part) in enumerate(self.song.items()):
            for section_idx in range(self.section_size):
                i = part_idx * self.section_size + section_idx
                print('Autotuning %s #%d...' % (singer, section_idx))

                #write that audio to disk and pitch detect
                audio_path = os.path.join(
                    self.save_path, 'tmp',
                    'raw' + self.song_name + '_' + singer + '.wav')
                pitch_path = os.path.join(
                    self.save_path, 'tmp',
                    'raw' + self.song_name + '_' + singer + '.f0')
                reaper_path = os.path.join('.', 'REAPER', 'build', 'reaper')

                soundfile.write(audio_path, audio[i], self.FS)

                subprocess.check_output(
                    [reaper_path, '-i', audio_path, '-f', pitch_path, '-a'])

                with open(pitch_path, 'r') as f:
                    reaper_output = f.read()

                reaper_output = reaper_output.split('\n')[
                    7:-1]  #skip header information
                time, voice, f0 = [], [], []
                for line in reaper_output:
                    t, v, f = line.split(' ')
                    time.append(float(t))
                    voice.append(int(v))
                    f0.append(float(f))

                # start = 0
                reaper_sample_rate = time[1] - time[0]  # REAPER frame period in seconds
                time0 = 0  # current note start time
                time1 = 0  # current note end time
                for note in part:
                    time1 = time0 + note['duration']
                    if note['volume'] != 0:
                        #compute pitch at that interval
                        start = int(time0 / reaper_sample_rate)
                        stop = int(time1 / reaper_sample_rate)
                        actual_pitch = np.asarray([
                            f for i, f in enumerate(f0[start:stop], start=start)
                            if voice[i] == 1  # keep only voiced frames
                        ]).mean()
                        desired_pitch = note['pitch']
                        semitone_shift = self.semitone_diff(
                            actual_pitch, desired_pitch)

                        #perform autotuning
                        start = int(time0 * self.FS)
                        stop = int(time1 * self.FS)
                        audio[i, start:stop] = pitch_shift(
                            audio[i, start:stop], self.FS, semitone_shift)

                    time0 = time1

        return audio
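Example #3 calls self.semitone_diff(actual_pitch, desired_pitch) without defining it. Assuming the usual equal-temperament relation (the helper is not shown in the source), a minimal sketch:

import numpy as np


def semitone_diff(actual_hz, desired_hz):
    # Octaves are log2 of the frequency ratio; there are 12 semitones per octave.
    return 12.0 * np.log2(desired_hz / actual_hz)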
Example #4
    def pitch_shifting(self, sample):
        levels = [-2, -1, 1, 2, None]
        pitch_target = levels[randint(0, len(levels) - 1)]
        if pitch_target is None:
            return sample
        else:
            return np.int16(
                pitch_shift(sample.astype(float), 16000, n_steps=pitch_target))
Example #5
        def get_data(train=True):
            batch_out = []
            interval = int(self.clip_sec * self.samplerate) * 2
            for batch_idx in range(self.batch_size):
                if train:
                    rec_idx = np.random.randint(len(train_data_wav))
                    crop_idx = np.random.randint(len(train_data_wav[rec_idx][0]) - interval)
                    sources = [i[crop_idx:crop_idx + interval] for i in train_data_wav[rec_idx]]
                else:
                    rec_idx = np.random.randint(len(test_data_wav))
                    crop_idx = np.random.randint(len(test_data_wav[rec_idx][0]) - interval)
                    sources = [i[crop_idx:crop_idx + interval] for i in test_data_wav[rec_idx]]

                if config.pitch_aug and train:
                    n_steps = pitch_shift_list[np.random.randint(len(pitch_shift_list))]
                    if n_steps != 0:
                        sources = [pitch_shift(i, self.samplerate, n_steps=n_steps) for i in sources]

                sources = [from_polar(to_stft(i, self.nfft)) for i in sources]
                if config.bpm_aug and train:
                    rate = stretch_rate_list[np.random.randint(len(stretch_rate_list))]
                    if rate != 1.0:
                        for i in range(len(sources)):
                            augmented = phase_vocoder(sources[i][:, :, 0] + 1j * sources[i][:, :, 1], rate=rate)
                            sources[i] = np.array([np.real(augmented), np.imag(augmented)]).transpose(1, 2, 0)
                if config.amp_aug and train:
                    sources = [i * (0.75 + (np.random.random() * 0.5)) for i in sources]
                sources = random_crop(sources, self.ydim)
                batch_out.append(sources)

            batch_out = np.array(batch_out).transpose(1, 0, 2, 3, 4)
            if train and true_wp(config.shuffle_sources_aug_prob) == 1.0:
                for source_i in range(self.num_sources):
                    np.random.shuffle(batch_out[source_i])
            return batch_out
Example #6
    def change_pitch(self):
        bins_per_octave = 24
        pitch_pm = 4
        pitch_change = pitch_pm * 2 * (np.random.uniform() - 0.5)
        self.X = pitch_shift(self.X,
                             self.sr,
                             n_steps=pitch_change,
                             bins_per_octave=bins_per_octave)
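In Example #6, bins_per_octave=24 makes each unit of n_steps a quarter tone, so the uniform draw over (-4, 4) steps spans at most ±2 semitones. A quick check of that conversion (my arithmetic, not from the source):

bins_per_octave = 24
pitch_pm = 4
semitones_per_step = 12 / bins_per_octave  # 0.5: each n_steps unit is a quarter tone
print(pitch_pm * semitones_per_step)       # 2.0 semitones maximum shift either way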
Example #7
def shift_pitch(wave: np.ndarray, sr: int, pitch_step: int = 5) -> np.ndarray:
    """
    @topic: Change the pitch of an audio wave by the given pitch_step.
    @input: wave: audio wave, sr: sampling rate, pitch_step: the step of the pitch shift.
    @return: wave_sp: pitch-shifted audio wave.
    """
    wave_sp = pitch_shift(wave, sr, pitch_step)
    return wave_sp
Example #8
def generate_pitch_shifting_augmentation(target_directory, input_directory):
    audio_file_extension = ".wav"

    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    for root, dirs, files in os.walk(input_directory):
        for file in files:
            if file.endswith(audio_file_extension):
                audio_path = os.path.join(root, file)
                signal, sample_rate = librosa.load(audio_path, sr=None)

                librosa.output.write_wav(
                    target_directory + '/' + 'lower' + file,
                    pitch_shift(signal, sample_rate, -2), sample_rate)

                librosa.output.write_wav(target_directory + '/' + 'higher' + file,
                                         pitch_shift(signal, sample_rate, 2),
                                         sample_rate)
Example #9
def LoadAudio_Arg(fname, shift, stretch):
    # args renamed: the original names shadowed the pitch_shift/time_stretch functions
    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    y = pitch_shift(y, C.SR, shift)
    y = time_stretch(y, stretch)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase
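Example #9 returns a peak-normalized magnitude and a unit-magnitude phase factor; their element-wise product rebuilds the complex spectrogram up to the dropped peak scale. A sketch of the inverse step, assuming the same C.* constants:

from librosa import istft

spec = mag * phase  # |S| * exp(j*angle(S)), up to the normalization constant
y_rec = istft(spec, hop_length=C.H, win_length=C.FFT_SIZE)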
Example #10
def pitch_shift_wavfile(wav, sr, n_octaves):
    peak = max(np.abs(np.max(wav)), np.abs(np.min(wav)))
    if n_octaves == 0:
        return wav
    new_wav = pitch_shift(y=wav.astype(float),
                          sr=sr,
                          n_steps=n_octaves * 12,
                          bins_per_octave=12)
    new_peak = max(np.abs(np.max(new_wav)), np.abs(np.min(new_wav)))
    new_wav = peak * new_wav / new_peak
    return new_wav
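The resampling inside pitch_shift can change the signal level, which is why Example #10 rescales the output so its peak matches the input's. A hypothetical call (the input array is made up):

import numpy as np

wav = np.random.uniform(-0.5, 0.5, 22050)  # placeholder: one second at 22.05 kHz
octave_up = pitch_shift_wavfile(wav, 22050, n_octaves=1)
assert np.isclose(np.abs(octave_up).max(), np.abs(wav).max())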
Example #11
def create_fetures(audio_path, aug_index):
    waveform, sample_rate = torchaudio.load(audio_path)
    if aug_index == 1:
        waveform = waveform + torch.randn(waveform.shape) * 0.001
    if aug_index == 2:
        waveform = torchaudio.functional.contrast(waveform)
    if aug_index == 3:
        if waveform.shape[0] == 2:
            waveform = torch.cat([
                torch.tensor(
                    pitch_shift(
                        waveform.squeeze(0).numpy()[0, :], sample_rate,
                        np.random.random(1)[0])).unsqueeze(0),
                torch.tensor(
                    pitch_shift(
                        waveform.squeeze(0).numpy()[1, :], sample_rate,
                        np.random.random(1)[0])).unsqueeze(0)
            ],
                                 dim=0)
        else:
            waveform = torch.tensor(
                pitch_shift(
                    waveform.squeeze(0).numpy(), sample_rate,
                    np.random.random(1)[0])).unsqueeze(0)
    if aug_index == 4:
        waveform = drc(waveform, bitdepth=6)
    if aug_index == 5 and '-a' in audio_path:
        with open(FILTER_RESPONSES.replace('s.', f's{random.randint(1, 6)}.'),
                  'rb') as handle:
            h = pickle.load(handle)
        waveform = torch.tensor(
            np.convolve(waveform.squeeze(0).numpy(), h, 'same')).unsqueeze(0)
    if waveform.shape[0] == 2:
        full_mel_3d = torch.cat([
            create_mels_deltas(waveform[0], sample_rate),
            create_mels_deltas(waveform[1], sample_rate)
        ],
                                dim=0)
    else:
        full_mel_3d = create_mels_deltas(waveform, sample_rate)
    return full_mel_3d
Example #12
    def pitch_shift(self, n_steps, **kwargs):
        """
        Shift the pitch of the audio time series and return the new object.

        This method is a wrapper over librosa_'s ``pitch_shift`` method.

        .. _librosa: https://librosa.org
        """
        new = self.copy()
        new.data = pitch_shift(new.data, new.fs, n_steps, **kwargs)

        return new
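A hypothetical usage of Example #12's wrapper; Audio is a stand-in name for whatever class hosts the method, assumed to expose .data (samples), .fs (sample rate), and .copy():

# Hypothetical: neither Audio nor y is defined in the source snippet.
clip = Audio(data=y, fs=22050)
up_two = clip.pitch_shift(2)                          # two semitones up
quarter_up = clip.pitch_shift(1, bins_per_octave=24)  # extra kwargs pass through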
Example #13
    def apply(self, waveform, **params):
        assert waveform.shape[0] == 1, 'waveform should have 1-channel'
        assert waveform.shape[1] > 0, 'waveform is empty'
        waveform = waveform.clone()

        n_steps = np.random.randint(
            self.min_steps, self.max_steps
        )  # n_steps < 0 -- shift down, n_steps > 0 -- shift up
        waveform = pitch_shift(waveform[0].numpy(),
                               self.sr,
                               n_steps=n_steps,
                               bins_per_octave=12)

        return torch.tensor(waveform, dtype=torch.float).unsqueeze(0)
Example #14
    def pitch_shifting(self, sample):
        """
        Modify the pitch of the audio
        ref: https://arxiv.org/pdf/1608.04363.pdf
        """
        levels = [-2, -1, 1, 2, None]
        pitch_target = levels[randint(0, len(levels) - 1)]
        if pitch_target is None:
            return sample
        else:
            return np.int16(
                pitch_shift(sample.astype(float),
                            SEQ_LENGTH,  # note: this argument is librosa's sample rate
                            n_steps=pitch_target))
Example #15
def speed_up(video_clip, speed):
    rate = 44100

    # Speed up video
    video_clip = video_clip.speedx(speed)

    # Determine compensating pitch shift from speed
    # (a linear approximation of -12 * log2(speed))
    shift = (1 - speed if speed >= 1 else (1 / speed) - 1) * 12

    # Fix audio pitch
    audio = video_clip.audio.to_soundarray(fps=rate).transpose()
    for i, channel in enumerate(audio):
        audio[i] = pitch_shift(channel, rate, shift)
    audio = audio.transpose()

    video_clip.audio = AudioArrayClip(audio, fps=rate)

    return video_clip
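The shift formula in Example #15 is a linear approximation that agrees with the exact value only at speed factors such as 1, 2, and 0.5; the exact compensation for a speed change is logarithmic. A sketch, assuming the goal is to undo the pitch change introduced by speedx:

import numpy as np


def compensating_shift(speed):
    # speedx scales pitch by `speed`; shifting by -12 * log2(speed) semitones
    # undoes it exactly (e.g. speed=2 -> -12, one octave down).
    return -12.0 * np.log2(speed)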
Example #16
def increasePitch(path, fileName, outputDir):
    """Increased a video segment's pitch by one semitone by extracting its audio track and saving it as
    a temporary external file

    Args:
        path (string): The path to the current video file
        fileName (string): The filename for the temporary audio file
        outputDir (string): The output directory
    """

    # Import audio from video
    print('Extracting audio from: ' + path)
    audio, sr = librosa.load(path)
    # Increase its pitch
    increasedPitch = pitch_shift(audio, sr, 1)
    # Save the .wav
    print('Saving audio .wav to: ' + outputDir + fileName)
    librosa.output.write_wav(outputDir + fileName + '.wav', increasedPitch, sr)
    # Convert the .wav to .mp3
    print('Converting to .mp3 from: ' + outputDir + fileName)
    mp3 = AudioSegment.from_wav(outputDir + fileName + '.wav')
    # Save the .mp3
    print('Saving .mp3 to: ' + outputDir + fileName)
    mp3.export(outputDir + fileName + '.mp3', 'mp3')
Example #17
wf += np.abs(0.2+np.cos(p2*4*t))*np.cos(p2*120*t)
sd.play(wf, fs, device=8)
swrite('swav.wav', fs, wf)

plt.figure()
plt.plot(t, wf)
plt.show(block=False)

sd.play(10*np.sin(2*3.14159*220*(2**(1/12))*t), fs, device=8)
440 * 2**(1/12)  # frequency one semitone above 440 Hz (A4 -> A#4)

sd.play(100*np.sin(2*3.14159*440*(2**(3/12))*t), fs, device=8)


wf = 10*np.sin(2*3.14159*220*(2**(1/12))*t)
wfps = pitch_shift(wf, fs, n_steps=0.5)
sd.play(wf, fs)
sd.play(wfps, fs)

mr = sd.rec(fs*dur, fs, channels=1, dtype='float64', device=1)
mr = mr.squeeze()
mrps = pitch_shift(mr, fs, n_steps=-5)

sd.play(10*mrps, fs)

sd.query_devices()


mrps.shape
mrps.dtype
np.amin(mrps)
Example #18
def pitch_shift(x, sr=16000, n_steps=15):
    return effects.pitch_shift(x, sr, torch.randint(low=-n_steps, high=n_steps, size=[1]).item())
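torch.randint excludes high, so Example #18 actually draws shifts from [-n_steps, n_steps - 1]; a symmetric range would need high=n_steps + 1. A quick check:

import torch

draws = torch.randint(low=-15, high=15, size=[100000])
assert draws.min().item() >= -15 and draws.max().item() <= 14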
Example #19
e_dissonance_dict = {}
audio_target = std.MonoLoader(filename=target)()
print("Target:", target)
print("======================")
for candidate in filelist:
    print("Candidate:", candidate)
    audio_candidate = std.MonoLoader(filename=candidate)()
    harmonicities = np.zeros(12)
    inharmonicities = np.zeros(12)
    dissonances = np.zeros(12)
    for pshift in range(12):
        if pshift == 0:
            mod_candidate = audio_candidate
        else:
            if pshift > 5:
                mod_candidate = pitch_shift(audio_candidate, 44100,
                                            pshift - 12).astype(np.float32)
            else:
                mod_candidate = pitch_shift(audio_candidate, 44100,
                                            pshift).astype(np.float32)
        mix_audio = mix(audio_target, mod_candidate, 44100)
        spf, mpf = utils.get_sines_per_frame(mix_audio, 44100)
        hpf, hpm = utils.get_hpeaks_per_frame(mix_audio, 44100)
        pcs = transform_to_pc(spf)
        n_frames = pcs.shape[0]
        har_frame = np.zeros(n_frames)
        inh_frame = np.zeros(n_frames)
        dis_frame = np.zeros(n_frames)
        for i in range(n_frames):
            # Peter's harmonicity part
            mspec = milne_pc_spectrum(pcs[i])
            har_frame[i] = ph_harmon(mspec)
Example #20
from os import path
import numpy as np
import soundfile as sf
from audio_helpers import play_audio
from librosa.effects import pitch_shift
from librosa.core import load
from librosa.output import write_wav


DIR = 'data/wave'
fn = '2.wav'
base_name, ext = path.splitext(fn)
FN = path.join(DIR, fn)
FN_NEW = path.join(DIR, '{}_shifted{}'.format(base_name, ext))

x, fs = load(FN)
print "Script loaded file with fs {}".format(fs)


def to_pcm(x):
    max_val = np.iinfo(np.int16).max
    return (x * max_val).astype(np.int16)


shifted = pitch_shift(x, fs, 2)
sf.write(FN_NEW, shifted, fs, subtype="PCM_24")
play_audio(FN_NEW)
Example #21
starting_times = (9, 25, 40, 52, 65, 77, 89, 121, 137, 157, 170, 184)
# times in seconds when each note starts in the input video
length = 8  # length of the generated clips
octaves = (0, 9)  # range of octaves
resolution = (600, 800)  # video resolution of the outputs

notes = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G',
         'G#']  # name of the notes

src_clips = [VideoFileClip(input_file).\
             subclip(start, start + length).\
             volumex(0.6)
             for start in starting_times]
# this list contains the clips for each note in the input video

for n in range(octaves[0] * 12, octaves[1] * 12):
    note_name = f"{n // 12}{notes[(n % 12)]}"
    clip = src_clips[(n - 3) % 12].copy()
    # the '-3' is because the source clips start at 'C', while generation iterates note names from 'A'
    trans = (((n - 3) // 12) - 5) * 12  # transposition offset
    clip.audio = AudioArrayClip(
        np.transpose(
            np.stack([
                pitch_shift(  # this function does the trick
                    np.transpose(clip.audio.to_soundarray())[channel],
                    48000,  # the sample rate of the source video
                    n_steps=trans) for channel in (0, 1)
            ])),  # (0, 1) because the source is stereo
        fps=48000)
    clip.write_videofile(f"{note_name}.mp4")
Example #22
def create_dataset_for_one_song(
    song_name,
    wav_path,
    y,
    sr,
    k,
    idx,
    npy_dataset_name,
    crop_size,
    beat_dir_path,
):
    shifts = [-12, -6, 0, 6, 12]
    for shift in tqdm(shifts, desc=song_name):
        shifted_y = y if shift == 0 else pitch_shift(y, sr, n_steps=shift)
        save_name1 = song_name if shift == 0 else f"{song_name}shift{shift}"
        for stretch_i in range(5):  # stretch 5 times randomly
            if stretch_i == 0:
                save_name2 = f"{save_name1}original" if shift == 0 else save_name1
                beat_path = f"{beat_dir_path}{song_name}.BEAT.TXT"
                melspec = convertAudio2MelSpec(wav_path)  # 100Hz
                activation, downbeats = convertBeatText2Activation(
                    beat_path, song_length=len(melspec), units="ms"
                )
                bpm = convertBeatText2Bpm(beat_path, len(melspec))
                max_bpm = np.max(bpm)
            else:
                max_rate = 300 / (max_bpm + 10)
                stretch_rates = [None, 0.5, 0.75, max_rate / 2, max_rate - 0.01]
                # stretch_rate = random.choice(np.arange(0.5, max_rate, 0.05))
                stretch_rate = stretch_rates[stretch_i]
                rounded_rate = np.round(stretch_rate, decimals=2)
                save_name2 = f"{save_name1}stretch{stretch_i}x{rounded_rate}"
                stretched_y = time_stretch(shifted_y, stretch_rate)
                melspec = convertAudio2MelSpec(None, True, stretched_y, sr)  # 100Hz
                stretched_activation = stretch_beat(activation, stretch_rate)
                stretched_bpm = stretch_bpm(bpm, stretch_rate)
                stretched_downbeats = (np.rint(downbeats / stretch_rate)).astype(
                    np.int64
                )
            beattheta = activation2beattheta(
                activation if stretch_i == 0 else stretched_activation
            )
            bartheta = activation2bartheta(
                activation if stretch_i == 0 else stretched_activation,
                downbeats if stretch_i == 0 else stretched_downbeats,
            )
            assert np.max(bartheta) > 0.5
            assert np.max(beattheta) > 0.5
            features = [
                ["melspec", melspec],
                ["activation", activation if stretch_i == 0 else stretched_activation],
                ["bpm", bpm if stretch_i == 0 else stretched_bpm],
                ["beattheta", beattheta],
                ["downbeattheta", bartheta],
            ]
            for feature in features:
                if feature[0] == "bpm":
                    feature[1][feature[1] > 300] = 300
                feature[1] = np.ascontiguousarray(feature[1])
                feature_length = len(feature[1])
                cropped_features = (
                    [feature[1]]
                    if feature_length < crop_size
                    else librosa.util.frame(
                        x=feature[1],
                        frame_length=crop_size,
                        hop_length=crop_size,
                        axis=0,
                    )
                )
                for index, cropped_feature in enumerate(cropped_features):
                    cropped_feature[cropped_feature < 0] = 0
                    fname = feature[0]
                    if fname == "activation":
                        fname = "beat"
                    elif fname == "melspec":
                        fname = f"melspec_sr{sr}_nfft1024"
                    if "original" in save_name2:
                        path = gen_path(k, "test", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    if idx > 25 and (idx - 1) % 25 < 5 and "shift" not in save_name2:
                        path = gen_path(k, "valid", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    else:
                        path = gen_path(k, "train", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
Example #23
def change_pitch(audio, rate, factor=1.0):
    # `factor` is the shift in semitones (librosa's n_steps)
    return pitch_shift(audio, rate, factor)
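Example #23 is the thinnest wrapper in the set. A minimal end-to-end usage sketch (file names are placeholders; keyword arguments are used so it also runs on librosa >= 0.10):

import librosa
import soundfile as sf
from librosa.effects import pitch_shift

y, sr = librosa.load("input.wav", sr=None)  # placeholder input path
y_up = pitch_shift(y, sr=sr, n_steps=3.0)   # three semitones up
sf.write("input_up3.wav", y_up, sr)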
Example #24
    def __call__(self, sample):
        n_steps = np.random.randint(-self.n_steps, self.n_steps)

        sample = pitch_shift(sample, self.sr, n_steps, self.bins_per_octave,
                             self.res_type)
        return sample
Example #25
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/original_mel.jpg')
plt.close()

waveform, sample_rate = torchaudio.load(audio_path)
bitdepth = 6
# mu-law companding (mu = 2**bitdepth - 1), coarse quantization, then expansion
waveform_drc = np.sign(waveform) * np.log(1 + (2 ** bitdepth - 1) * np.abs(waveform)) / np.log(2 ** bitdepth)
waveform_drc = np.round(waveform_drc * (2 ** (bitdepth - 1))) / (2 ** (bitdepth - 1))
waveform_drc = np.sign(waveform_drc) * ((2 ** bitdepth) ** np.abs(waveform_drc) - 1) / (2 ** bitdepth - 1)
wavio.write("/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/drc.wav", waveform_drc[0].numpy(), sample_rate,
            sampwidth=2)
one_mel = melspectrogram(waveform_drc.squeeze(0).numpy(), sr=sample_rate, n_fft=2048, hop_length=1024,
                         n_mels=128, fmin=0.0, fmax=sample_rate / 2, htk=True, norm=None)
one_mel = np.log(one_mel + 1e-8)
one_mel = (one_mel - np.min(one_mel)) / (np.max(one_mel) - np.min(one_mel))
print(np.linalg.norm(abs(waveform_drc - waveform), 2))
plt.imshow(one_mel)
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/drc_mel.jpg')
plt.close()

waveform_pitch_shift = pitch_shift(waveform.squeeze(0).numpy(), sample_rate, 0.5)
wavio.write("/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/pitch_shift.wav", waveform_pitch_shift, sample_rate,
            sampwidth=2)
one_mel = melspectrogram(waveform_pitch_shift, sr=sample_rate, n_fft=2048, hop_length=1024,
                         n_mels=128, fmin=0.0, fmax=sample_rate / 2, htk=True, norm=None)
one_mel = np.log(one_mel + 1e-8)
one_mel = (one_mel - np.min(one_mel)) / (np.max(one_mel) - np.min(one_mel))
print(np.linalg.norm(abs(waveform_pitch_shift - waveform.squeeze(0).numpy()), 2))
plt.imshow(one_mel)
plt.savefig('/mnt/disks/nlioz/DCASE2021/dcase2020/output_test/waveform_pitch_shift.jpg')
plt.close()