Example #1
0
def test_missing_cli(cli, ctx):
    '''Simulate not having rubberband-cli installed and check for
    the appropriate exception.
    '''

    with ctx:
        pyrubberband.pyrb.__RUBBERBAND_UTIL = cli
        pyrubberband.pitch_shift(np.random.randn(22050), 22050, 1)
Example #2
0
    def pitch_shift_test(self):
        '''
        Function for testing the audio degradation created by the pitch shifting
        This is now ahead of the other stuff

        TODO figure out why the duration is exploding -> bytes are being increased when we filter
        '''
        print("In test")
        print("Ownership Duration")
        #print(self.ownership_AudioSegment.duration_seconds)

        channels = self.ownership_AudioSegment.channels
        sample_width = self.ownership_AudioSegment.sample_width
        semitones = 5 #Semitones(200) + minHuman speech(100) = inaudible Audio

        #Set high and low bounds on original Ownership Audio
        seg = self.ownership_AudioSegment
        seg = seg.set_frame_rate(self.original_sample_rate)  # set_frame_rate returns a new AudioSegment
        cleaned_ownership_sound = seg.low_pass_filter(200) #Filter out noise above this
        cleaned_ownership_sound2 = cleaned_ownership_sound.high_pass_filter(50) #And below this (human speech is typically 100-150Hz)
        cleaned_ownership_sound2 = cleaned_ownership_sound2.set_frame_rate(self.original_sample_rate)
        cleaned_ownership_sound2.export("./pst_original.mp3", format="mp3") #This is just so we can see it
        ownership_wav_data = np.array(cleaned_ownership_sound2.get_array_of_samples())

        #Pitch shift ownership audio to inaudible
        inaudible_samples = pyrubberband.pitch_shift(ownership_wav_data, self.original_sample_rate, semitones)

        # Pitch shift back down to audible and clean again
        audible_samples = pyrubberband.pitch_shift(inaudible_samples, self.original_sample_rate, -semitones)

        print(audible_samples.shape)
        print("checking eq")
        print(np.allclose(audible_samples, ownership_wav_data)) #samples are way off, why
        mse = (np.square(audible_samples - ownership_wav_data)).mean(axis=0)
        print(mse) #Huge MSE off of a pitch shift of just 5

        sf.write('./pst_temp.wav', audible_samples, self.original_sample_rate)
        sound = AudioSegment.from_file('./pst_temp.wav', format="wav", sample_width=sample_width, frame_rate=self.original_sample_rate, channels=channels)
        other_samples = np.array(sound.get_array_of_samples())
        print(other_samples.shape)
        print("Transformed duration: ")
        print(sound.duration_seconds)
        cleaned_sound = sound.low_pass_filter(300)
        cleaned_sound2 = cleaned_sound.high_pass_filter(2)
        cleaned_sound2 = cleaned_sound2.set_frame_rate(self.original_sample_rate)
        cleaned_sound2 = cleaned_sound2+10
        print("Cleaned Transformed Duration")
        print(cleaned_sound2.duration_seconds)
        cleaned_sound2.export('./pst_transformed.mp3', format="mp3")
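Note on the "samples are way off" comparison above: pydub's get_array_of_samples() returns integer PCM (int16 for 16-bit audio), while pyrubberband round-trips the signal through temporary WAV files and, in current versions, hands back float64 samples in [-1, 1] regardless of the input dtype, so the two arrays live on different scales; rubberband is also not a sample-exact transform, so some error remains even at matched scale. A minimal sketch of the scale handling (the 32768 divisor assumes 16-bit samples):

    import numpy as np
    import pyrubberband

    sr, semitones = 22050, 5
    int_samples = (np.random.randn(sr) * 8000).astype(np.int16)   # stand-in for get_array_of_samples()
    float_samples = int_samples.astype(np.float64) / 32768.0      # int16 -> [-1, 1]
    shifted = pyrubberband.pitch_shift(float_samples, sr, semitones)
    restored = pyrubberband.pitch_shift(shifted, sr, -semitones)
    n = min(len(restored), len(float_samples))
    mse = np.mean((restored[:n] - float_samples[:n]) ** 2)        # small but nonzero: the shift is lossy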
Example #3
0
def postprocess_source(file: Path, config: DatasetConfig):
    """
    Applies postprocessing on main source signal (positive or negative word)
    :param file: path to main signal file
    :param config: dataset config
    :return: completed source signal
    """
    main_signal, _ = librosa.load(str(file), sr=config.sample_rate)
    undercuts = [np.random.uniform(*config.word_undercut_range) for _ in range(2)]
    left, right = [config.sec_to_samples(x) for x in undercuts]
    main_signal = main_signal[left:-right]

    if np.random.random() < config.rubberband_ratio:
        pitch = np.random.uniform(*config.rubberband_pitch_range)
        main_signal = pyrubberband.pitch_shift(main_signal, config.sample_rate, pitch)

    bg_source = get_weighted_item(config.background_sources)
    bg_file = get_random_file(AUDIOSET_PATH / bg_source)
    bg_signal, _ = librosa.load(str(bg_file), sr=config.sample_rate)
    bg_signal = get_random_chunk(bg_signal, config.sample_length, config.sample_rate)
    bg_gain = np.random.uniform(*config.background_gain_range)
    bg_signal = bg_gain * librosa.util.normalize(bg_signal) * np.max(main_signal)

    right_margin = np.random.uniform(*config.word_right_margin_range)
    main_signal_end = config.sample_length - right_margin
    return merge_signals(
        (bg_signal, 0, None),
        (main_signal, None, main_signal_end),
        sample_rate=config.sample_rate,
        length=config.sample_length,
    )
Example #4
def getPitchShiftedSpecs(X, Fs, W, H, shiftrange = 6, GapWins = 10):
    """
    Concatenate a bunch of pitch shifted versions of the spectrograms
    of a sound, using the rubberband library
    :param X: A mono audio array
    :param Fs: Sample rate
    :param W: Window size
    :param H: Hop size
    :param shiftrange: The number of half steps below and above by which \
        to shift the sound
    :param GapWins: The number of all-zero gap frames inserted between \
        successive shifted spectrograms
    :returns SRet: The concatenated spectrogram
    """
    SRet = np.array([])
    for shift in range(-shiftrange, shiftrange+1):
        print("Computing STFT pitch shift %i"%shift)
        if shift == 0:
            Y = np.array(X)
        else:
            Y = pyrb.pitch_shift(X, Fs, shift)
        S = STFT(Y, W, H)
        Gap = np.zeros((S.shape[0], GapWins), dtype=np.complex)
        if SRet.size == 0:
            SRet = S
        else:
            SRet = np.concatenate((SRet, Gap, S), 1)
    return SRet
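A minimal usage sketch for getPitchShiftedSpecs above (it assumes the project's own STFT(X, W, H) helper is importable, and "input.wav" is a hypothetical file name):

    import librosa
    X, Fs = librosa.load("input.wav", sr=44100, mono=True)   # hypothetical input
    S = getPitchShiftedSpecs(X, Fs, W=2048, H=512, shiftrange=6, GapWins=10)
    # S holds the STFTs of all 13 shifted versions (-6..+6 semitones) side by side,
    # separated by GapWins all-zero columns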
Example #5
0
    def convert_from_human_inaudible(self):
        #TODO Find out where degradation in the negative shift is coming from -> alternatively, could apply a fade so a smaller pitch shift is required
        #TODO Find out a way to do this without access to the original audio file
        '''
        Want to either sample above a certain frequency or otherwise take the difference between the two files.
        Failing that, we can overlay the inverse of the original
        '''
        start_time = 0
        duration = self.ownership_AudioSegment.duration_seconds
        semitones = -self.pitch_shift

        sound1 = AudioSegment.from_file(self.combined_path, start_second=start_time, duration=duration, format="wav")
        sound1 = sound1.set_frame_rate(self.higher_sample_rate)
        original_sound = AudioSegment.from_file(self.original_file, start_second=start_time, duration=duration, format="wav")
        original_sound = original_sound.set_frame_rate(self.higher_sample_rate)
        sound2 = original_sound.invert_phase() #This should cancel out the original audio in our clip

        combined = sound1.overlay(sound2) #This should be just our pitch shifted ownership audio
        combined = combined.set_frame_rate(self.higher_sample_rate)
        wav_data = np.array(combined.get_array_of_samples())

        try:
            audible_ownership = pyrubberband.pitch_shift(wav_data, self.higher_sample_rate, semitones) #This should reverse the original pitch shift
            temp2 = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            sf.write(temp2.name, audible_ownership, self.higher_sample_rate)
            wav_to_mp3(temp2.name, './recovered_ownership.mp3', force=True) #Converted to mp3 for convenience
            temp2.close()
        except Exception as e:
            print(e)
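The cancellation idea in the docstring can be checked with plain numpy (an illustrative toy, not the class above): if the combined track is original + watermark, overlaying the phase-inverted original leaves just the watermark, up to quantization error in real int16 audio.

    import numpy as np
    original = np.random.randn(22050)
    watermark = 0.1 * np.random.randn(22050)
    combined = original + watermark            # what overlay() produces
    recovered = combined + (-original)         # invert_phase() followed by overlay()
    print(np.allclose(recovered, watermark))   # True for this float toy signal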
Example #6
0
def getPitchShiftedRubberbandCQTs(X,
                                  Fs,
                                  CShape,
                                  bins_per_octave,
                                  shiftrange=6,
                                  GapWins=10):
    """
    Concatenate a bunch of pitch shifted versions of the NSGT CQT of X to each other.
    Use the rubberband library to get the best results (even though pitch shifting
    could be done with CQT + Griffin Lim, as with Nakamura)
    :param X: A 1D array of audio samples
    :param Fs: Sample rate
    :param CShape: Shape (bins x frames) to which each shifted CQT is zero-padded
    :param bins_per_octave: Bins per octave in the NSGT
    :param shiftrange: The number of half steps below and above by which \
        to shift the sound
    :param GapWins: The length of the gap to include between \
        pitch shifted CQTs
    :returns CRet: The concatenated CQT spectrogram with all pitch shifts
    """
    import pyrubberband as pyrb
    CRet = np.array([], dtype=np.complex)
    for shift in range(-shiftrange, shiftrange + 1):
        Y = pyrb.pitch_shift(X, Fs, shift)
        thisC = np.zeros(CShape, dtype=np.complex)
        thisCi = getNSGT(Y, Fs, bins_per_octave)
        thisC[:, 0:thisCi.shape[1]] = thisCi
        if CRet.size == 0:
            CRet = thisC
        else:
            Gap = np.zeros((thisC.shape[0], GapWins), dtype=np.complex)
            CRet = np.concatenate((CRet, Gap, thisC), 1)
    return CRet
Example #7
0
def pitchShift(loop, pitch):
    y, sr = librosa.load(can_path + loop, sr=44100)
    sf.write(loop, y, samplerate=44100)  #original candidate
    # pitch shifting (maybe a little difference after shifting)
    y_shift = pyrb.pitch_shift(y, sr, n_steps=-pitch)
    sf.write(can_ps_output, y_shift, samplerate=44100)
    #y_shift = librosa.effects.pitch_shift(y, sr, n_steps=pitch) # via librosa
Example #8
def rb_pitch(data, sample_rate):
    """
    Pitch Tuning.
    """
    bins_per_octave = 12
    pitch_pm = 2
    pitch_change = pitch_pm * 2 * (np.random.uniform())
    data = pitch_shift(data.astype('float64'), sample_rate, n_steps=pitch_change)  # , bins_per_octave=bins_per_octave
    return data
Example #9
0
 def __call__(self, wav=None,
              sr=22050):
     assert len(wav.shape)==1
     if random.random() < self.prob:
         alpha = self.limit * random.uniform(-1, 1)
         if self.use_pyrb:
             wav = pyrb.pitch_shift(wav, sr, alpha)
         else:
             wav = librosa.effects.pitch_shift(wav, sr, n_steps=alpha)
     return {'wav': wav,'sr': sr}
Example #10
0
    def __call__(self, x, shift=1):
        """Shift the pitch of given signal
        
        Args:
            x (numpy.ndarray): input signal (n_samples,)
            shift (float, int): degree of shifting (unit:semitones)

        Returns:
            numpy.ndarray: output (n_samples,)
        """
        y = pyrb.pitch_shift(x, self.sample_rate, shift)
        return y
Example #11
0
    def convert_to_human_inaudible(self, output_path):
        #TODO figure out exact semitones to inaudible
        #TODO figure out how to speed this up (chunk processing)
        seg = self.ownership_AudioSegment
        seg = seg.set_frame_rate(self.higher_sample_rate)
        wav_data = np.array(seg.get_array_of_samples())
        semitones = self.pitch_shift

        try:
            inaudible_samples = pyrubberband.pitch_shift(wav_data, self.higher_sample_rate, semitones)
            sf.write(output_path, inaudible_samples, self.original_sample_rate)
        except Exception as e:
            print(e)
Example #12
0
 def do_agumentation(self):
     no_class = os.listdir(self.input_path)
     for name in no_class:
         files = os.listdir(self.input_path + name + "/")
         files = [f for f in files if f.endswith(".wav")]
         for i, audio in enumerate(files):
             print(audio)
             y, sr = sf.read(self.input_path + name + "/" + audio)
             time = random.uniform(0.6, 1.3)
             y_stretch = pyrb.time_stretch(y, sr, time)
             y_agument = pyrb.pitch_shift(y_stretch, 22050, 1)
             # print(y_agument)
             wav.write(self.output_path + name + "/" + "agumented_" + audio,
                       sr, y_agument)
             print(name + "/" + "agumented_" + audio,
                   "has augmented and saved")
Example #13
0
def pitchshift(folder, filename, shifts=24):
    """
    Pitch-shift the audio file given as input and save the results in the folder given as input

    Args:
        folder (str): path to the folder where the pitch-shifted audio is saved
        filename (str): path to the audio file to pitch-shift
        shifts (int, optional): number of semitone shifts to generate. Defaults to 24.
    """
    logger.info("loading audio")
    audio, orig_sr = librosa.load(filename)
    audio = librosa.resample(audio, orig_sr, target_sr)

    logger.info("shifting pitch")
    root = 40  # e2 is 40

    folder = pathlib.Path(folder).absolute()

    for n_steps in range(0, shifts + 1):
        # audio_shifted = librosa.effects.pitch_shift(audio, target_sr, n_steps, bins_per_octave=12)
        audio_shifted = pyrb.pitch_shift(audio, target_sr, n_steps)
        new_filename = f"{root+n_steps}.wav"
        new_filepath = folder / new_filename
        audio_shifted = audio_shifted.astype("float32")
        write(new_filepath, target_sr, audio_shifted)
        logger.debug(f"Creating: {new_filename}")
        logger.debug("==============================")

        # write("{}{}.wav".format(folder, i+root), y_shift, sr)
        # write(f'{folder}/{root+i}.wav', sr, y_shift)

    for n_steps in range(0 - shifts // 2, (shifts // 2) + 1):
        # amplitude normalization
        new_filename = f"{root+n_steps}.wav"
        new_filepath = folder / new_filename
        sound = AudioSegment.from_file(new_filepath, "wav")
        normalized_sound = match_target_amplitude(sound, -30.0)
        # another way
        # normalized_sound = effects.normalize(sound)
        normalized_sound.export(new_filepath, format="wav")
        logger.debug(f"Normalizing: {new_filename}")
        logger.debug("==============================")

    logger.info(f"Audio files saved in folder: {folder}")
Example #14
0
def test_pitch(sr, num_samples, freq, n_step):

    y = synth(sr, num_samples, freq)

    y_s = pyrubberband.pitch_shift(y, sr, n_step)

    # Make sure we have the same duration
    assert np.allclose(len(y), len(y_s))

    # compare to a directly synthesized target track

    # we'll compare normalized power spectra to avoid phase issues
    t_freq = freq * 2.0**(n_step / 12.0)
    y_f = synth(sr, num_samples, t_freq)

    s_s = np.abs(np.fft.rfft(y_s))
    s_f = np.abs(np.fft.rfft(y_f))

    assert np.allclose(s_s / s_s[0], s_f / s_f[0], atol=1e-2)
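The target frequency above follows the equal-temperament relation f_target = f * 2**(n_step / 12); a quick check:

    freq, n_step = 440.0, 12
    t_freq = freq * 2.0 ** (n_step / 12.0)
    print(t_freq)   # 880.0: shifting up 12 semitones doubles the frequency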
Example #15
0
def test_pitch(sr, num_samples, freq, n_step):

    y = synth(sr, num_samples, freq)

    y_s = pyrubberband.pitch_shift(y, sr, n_step)

    # Make sure we have the same duration
    assert np.allclose(len(y), len(y_s))

    # compare to a directly synthesized target track

    # we'll compare normalized power spectra to avoid phase issues
    t_freq = freq * 2.0**(n_step / 12.0)
    y_f = synth(sr, num_samples, t_freq)

    s_s = np.abs(np.fft.rfft(y_s))
    s_f = np.abs(np.fft.rfft(y_f))

    assert np.allclose(s_s / s_s[0], s_f / s_f[0], atol=1e-2)
Example #16
0
def _set_pitch_shifted_path(audio_object):
    pitch_semitones = audio_object['pitch_semitones']
    file = audio_object['file']
    pitch_shifted_path = audio_object['absolute_path']

    # pitch shift audio as needed
    if pitch_semitones:
        shifted_file = f'PITCH SHIFTED {pitch_semitones}: {Path(file).stem}.wav'
        pitch_shifted_path = join(PLAYLIST_PATH, shifted_file)

        # create new pitch shifted file if not exists
        if not Path(pitch_shifted_path).is_file():
            y, sr = librosa.load(audio_object['absolute_path'])
            print(f'shifting file by {pitch_semitones}: "{file}"')
            y_shift = pyrubberband.pitch_shift(y, sr, pitch_semitones)
            soundfile.write(pitch_shifted_path, y_shift, sr)
        else:
            print(f'CACHED SHIFT: "{shifted_file}"')

    audio_object['pitch_shifted_path'] = pitch_shifted_path
Example #17
0
    def augment(self, array, count):
        # Original signal
        # Second argument determines the type of augmentation applied to the signal
        self.sigToImage(array, 1, count)

        # Noise addition using normal distribution with mean = 0 and std =1
        # Permissible noise factor value = x > 0.004
        noiseAdding = array + 0.009 * np.random.normal(0, 1, len(array))
        self.sigToImage(noiseAdding, 2, count)

        # Permissible factor values = samplingRate / 100
        timeShifting = np.roll(array, int(500 / 100))
        self.sigToImage(timeShifting, 3, count)

        # Permissible factor values = -5 <= x <= 5
        pitchShifting = pyrb.pitch_shift(array, 500, -3)
        self.sigToImage(pitchShifting, 4, count)

        # Permissible factor values = 0 < x < 1.0
        factor = 0.95  # Yields the best results without losing ECG wave shape
        timeStretching = pyrb.time_stretch(array, 500, factor)
        self.sigToImage(timeStretching, 5, count)
Example #18
0
 def audio(mudabox, state):
     mudabox._audio['y'] = pyrb.pitch_shift(mudabox._audio['y'],
                                            mudabox._audio['sr'],
                                            state['n_semitones'])
Example #19
def testPitchShift(X, Fs, W, H, shift, filename):
    W = 2048
    H = 128
    S = np.abs(STFT(X, W, H))
    S = pitchShiftSTFT(S, Fs, shift)
    X2 = griffinLimInverse(S, W, H, 20)
    wavfile.write(filename, Fs, X2)

if __name__ == '__main__':
    import librosa
    import pyrubberband as pyrb
    X, Fs = librosa.load("music/Beatles_LetItBe.mp3", sr=44100)
    print("Fs = ", Fs)
    shift = 2
    noctaves = 7
    y = pyrb.pitch_shift(X, Fs, shift)
    wavfile.write("rubberbandshift%i.wav"%shift, Fs, y)
    testPitchShift(X, Fs, 2048, 128, shift, "gfshift%i_stft.wav"%shift)

    from NMF import shiftMatLRUD
    winSize = 4096
    hopSize = 256
    S = STFT(X, winSize, hopSize)
    S = np.abs(S)
    M = warpSTFTMel(S, Fs, winSize)
    M = shiftMatLRUD(M, di=shift*2)
    S = unwarpSTFTMel(M, Fs, winSize)
    y = griffinLimInverse(S, winSize, hopSize)
    y = y/np.max(np.abs(y))
    wavfile.write("melshift%i.wav"%shift, Fs, y)
Example #20
0
                          mode="same")
        x_res[safe_index] += inv

    print("filter cost {}".format(time.time() - start_time))
    return x_res, x_glottal_res, recons_psds, recons_vt_psds


x_res, x_glottal_res, recons_psds, recons_vt_psds = inverse_lpc_fftconvolve(
    x, dat)

wavwrite('x_res.wav', fs, (x_res * 2**15).astype(np.int16))
wavwrite('x_glottal_res.wav', fs, (x_glottal_res * 2**15).astype(np.int16))

shift = -12
import pyrubberband as pyrb
x_res = pyrb.pitch_shift(x_res, fs, shift)

y = synthesisRequiem.get_waveform(x_res, np.transpose(recons_psds, [1, 0]),
                                  dat['temporal_positions'], dat['f0'],
                                  dat['fs'])

x_glottal_res = pyrb.pitch_shift(x_glottal_res, fs, shift)
y_from_glottal = synthesisRequiem.get_waveform(
    x_glottal_res, np.transpose(recons_vt_psds, [1, 0]),
    dat['temporal_positions'], dat['f0'], dat['fs'])

wavwrite('x_recons.wav', fs, (y * 2**15).astype(np.int16))
wavwrite('x_recons_glottal.wav', fs, (y_from_glottal * 2**15).astype(np.int16))

Example #21
from scipy import interpolate
import copy
import wave
import sys
from pydub import AudioSegment
import soundfile as sf
import pyrubberband as pyrb

# sound = AudioSegment.from_mp3(sys.argv[1])
# sound.export("file.wav", format="wav")

y, sr = sf.read("0.wav")
y_stretch = pyrb.time_stretch(y, sr, 0.90)
y_shift = pyrb.pitch_shift(y, sr, 0.90)
sf.write("analyzed_filepathX5.wav", y_stretch, sr, format='wav')
Example #22
0
def pitch_shifting(sig, sr, degree):
    return pyrb.pitch_shift(sig, sr, degree)
Example #23
0
    def pitchshift(self, n):
        note = n - self.startNote
        sound = np.int16(
            prb.pitch_shift(self.sample, self.rate, note) * (2**15))

        return sound
Example #24
y = y[:3 * 44100]
wn = np.clip((np.random.rand(len(y)) * 2 - 1) * 0.01, -1, 1)
y_wn = np.clip(y + wn, -1, 1)

gain_float = gain = 10**(6 / 20)
y_gain = np.clip(y * gain_float, -1, 1)

B, A = signal.butter(5, 5000 / (44100 / 2), btype='lowpass')
y_lp = signal.lfilter(B, A, y, axis=0)

B, A = signal.butter(5, 10000 / (44100 / 2), btype='highpass')
y_hp = signal.lfilter(B, A, y, axis=0)

y_stretch = pyrb.time_stretch(y, 44100, 0.7)[:3 * 44100]

y_pitch = pyrb.pitch_shift(y, 44100, -5)

fig, (ax1, ax2, ax3) = plt.subplots(ncols=2, nrows=3)
fig.tight_layout(pad=3)
Pxx, freqs, bins, im = ax1[0].specgram(y,
                                       NFFT=1024,
                                       Fs=44100,
                                       noverlap=900,
                                       scale_by_freq=True,
                                       detrend="mean")
ax1[0].set_title('Original')
ax1[0].set(ylabel='Frequency (Hz)')
ax1[0].xaxis.set_ticks([])
ax1[0].yaxis.set_ticks([0, 10_000, 20_000])
ax1[0].yaxis.set_ticklabels(["0", "10k", "20k"])
Pxx, freqs, bins, im = ax1[1].specgram(y_wn,
Example #25
    def __init__(
        self, files, limit=None, augment=False, duplicate=1, seed="42", device=torch.device("cpu"),
    ):
        self.device = device
        random = Random(seed)  # init random generator
        pos_prep = PositionalEncodingLabeler(POS_DIM, scale=POS_SCALE)

        self.counts = []

        base = self.base

        if limit is not None:
            files = files[:limit]

        inp, out_vec, out_int, out_map, out_dur, out_trans, out_trans_ints = (
            [],
            [],
            [],
            [],
            [],
            [],
            [],
        )
        position, border, weight, inp_mfcc = [], [], [], []

        duplicate_set = set()
        self.files = []

        def stack(arr, tensor):
            return [tensor(a).to(self.device) for a in arr]

        for i, (label_file, audio_file) in enumerate(files * duplicate):
            assert self.get_name(label_file) == self.get_name(audio_file)
            a, b, c = self.get_name(label_file)
            f"{a}_{b}_{c}_{i}"

            label_file = os.path.join(base, "data", label_file)
            audio_file = os.path.join(base, "data", audio_file)

            def loader():
                return AudioCaching.load(audio_file)

            audio = self.get_set(audio_file, loader)
            audio_scaling, rate = 32768.0 / 512, 16000
            audio_base_len = len(audio)

            meter = pyln.Meter(rate)  # create BS.1770 meter
            loudness = meter.integrated_loudness(audio)
            audio = pyln.normalize.loudness(audio, loudness, -40.0)

            stretch = 1
            pure_key = (audio_file, "pure_key")
            if pure_key not in duplicate_set:
                duplicate_set.add(pure_key)
            elif augment:
                # pitch = random.choice([-6, -4, -1, 1, 4, 6])
                # stretch = random.choice([0.85, 0.9, 0.95, 1.05, 1.1, 1.15])
                # pitch = random.choice([-4, -1, 1, 4])
                pitch = random.choice([-1, 0, 1])
                stretch = random.choice([0.9, 0.95, 1.05, 1.1])

                key_stretch = "time_stretch", stretch
                key_pitch = "pitch_shift", pitch, stretch

                duplication_key = (audio_file, key_pitch)
                if duplication_key in duplicate_set:
                    continue
                duplicate_set.add(duplication_key)

                def audio_pitch_shift():
                    return pyrb.pitch_shift(audio, rate, pitch)

                cache_audio = AudioCaching.get(audio_file, key_pitch)
                got_pitch = cache_audio is not None
                cache_audio = cache_audio if got_pitch else AudioCaching.get(audio_file, key_stretch)
                got_stretch = cache_audio is not None

                audio = cache_audio if cache_audio is not None else audio

                if not got_stretch:
                    audio = pyrb.time_stretch(audio, rate, stretch)
                    AudioCaching.set(audio_file, key_stretch, audio)

                if not got_pitch:
                    audio = pyrb.pitch_shift(audio, rate, pitch)
                    AudioCaching.set(audio_file, key_pitch, audio)

                stretch = len(audio) / audio_base_len

            fbank_feat = logfbank(
                audio, rate, winlen=WIN_SIZE, winstep=WIN_STEP, nfilt=INPUT_SIZE,
            )  # TODO: remove scaling
            mfcc_feat = mfcc(
                audio, rate, winlen=WIN_SIZE, winstep=WIN_STEP, nfilt=32, numcep=16,
            )  # TODO: remove scaling

            # some audio instances are too short for the audio transcription
            # and the winlen cut :(
            fbank_feat = np.vstack([fbank_feat] + [fbank_feat[-1]] * 10)
            mfcc_feat = np.vstack([mfcc_feat] + [mfcc_feat[-1]] * 10)

            step_size = WIN_STEP * 1000
            with open(label_file) as f:
                lines = list(f.readlines())
                length = fbank_feat.shape[0]
                length_ms = length * step_size
                labels = []
                ms_samples = 16

                for line in lines:
                    _, end, tag = line.split()
                    end_ms = float(end) / ms_samples * stretch
                    end_ms = min(end_ms, length_ms)
                    labels.append((end_ms, tag))

                length = int(end_ms / step_size)

            (tag_ints, tag_vecs, tag_mapping, transcription, transcription_ints) = self.process_audio(
                labels, length, step_size,
            )
            fbank_feat = fbank_feat[: len(tag_ints)]
            mfcc_feat = mfcc_feat[: len(tag_ints)]

            length = fbank_feat.shape[0]
            length_ms = length * step_size

            w = [200.0 / FOUND_LABELS[KNOWN_LABELS[_pid]] for _pid, _ms in tag_mapping]

            if i % 150 == 0:
                print(i)
                gc.collect()

            if length == len(tag_vecs) and length == len(tag_ints):
                original = stack([tag_vecs], torch.FloatTensor)[0].cpu().numpy()
                original_ids = np.argmax(original, axis=1)
                if MERGE_DOUBLES:
                    a, b, diff = find_borders(original_ids, tag_mapping)
                    d = abs(diff).max()
                    if d > 15:
                        print(
                            f"[DIFF-ERROR] diff is bigger {d} > 15", np.where(abs(diff) > 15), diff.shape,
                        )
                        print("\t", tag_mapping[-1], length_ms)
                        print("\t", np.round(a[-5:], 0))
                        print("\t", np.round(b[-5:], 0))
                        continue
                self.counts.append(length)
                tag_duration = []
                start = 0
                for _, end_ms in tag_mapping:
                    end_time = end_ms / DURATION_SCALER
                    tag_duration.append(end_time - start)
                    start = end_time  # CUMSUM vs DURATION

                pos, bor = pos_prep(torch.FloatTensor(tag_duration[:-1]).to(device))
                position.append(pos)
                border.append(bor)
                weight.append(w)

                out_dur.append(tag_duration)
                inp.append(fbank_feat)
                inp_mfcc.append(mfcc_feat)
                out_vec.append(tag_vecs)
                out_int.append(tag_ints)

                out_trans.append(transcription)
                out_trans_ints.append(transcription_ints)

                out_map.append(tag_mapping)
                self.files.append((label_file, audio_file))
            else:
                print(
                    f"[ERROR] len not match {length} != {len(tag_vecs)} != {len(tag_ints)} \n\t - {label_file}\n\t - {audio_file}",
                )

        self.inp = stack(inp, torch.FloatTensor)
        self.inp_mfcc = stack(inp_mfcc, torch.FloatTensor)

        self.out_vec = stack(out_vec, torch.FloatTensor)
        self.out_int = stack(out_int, torch.LongTensor)
        self.transcription = stack(out_trans, torch.FloatTensor)
        self.transcription_int = stack(out_trans_ints, torch.LongTensor)

        self.out_map = out_map
        self.out_duration = stack(out_dur, torch.FloatTensor)
        self.in_transcription = stack(out_trans, torch.FloatTensor)
        self.key = [uuid.uuid4().urn for i in range(len(inp))]
        self.position = position
        self.border = border
        self.weight = stack(weight, torch.FloatTensor)

        FEATURES = RawField(postprocessing=self.features_batch_process)
        FEATURES_MFCC = RawField(postprocessing=self.features_batch_process)
        LABEL = RawField(postprocessing=self.features_batch_process)
        TRANSCRIPTION_INT = RawField(postprocessing=self.features_batch_process)
        TRANSCRIPTION = RawField()
        LABEL_VEC = RawField()
        OUT_MAP = RawField()
        OUT_DUR = RawField(postprocessing=self.features_batch_process)
        IN_TRANS = RawField(postprocessing=self.features_batch_process)
        INDEX = RawField()
        KEY = RawField()
        POSITION = RawField(postprocessing=self.features_batch_process)
        BORDER = RawField(postprocessing=self.features_batch_process)
        WEIGHT = RawField(postprocessing=self.features_batch_process)

        setattr(FEATURES, "is_target", False)
        setattr(FEATURES_MFCC, "is_target", False)
        setattr(LABEL_VEC, "is_target", False)
        setattr(OUT_MAP, "is_target", False)
        setattr(TRANSCRIPTION, "is_target", False)
        setattr(TRANSCRIPTION_INT, "is_target", False)
        setattr(OUT_DUR, "is_target", False)
        setattr(IN_TRANS, "is_target", False)
        setattr(LABEL, "is_target", True)
        setattr(INDEX, "is_target", False)
        setattr(KEY, "is_target", False)
        setattr(POSITION, "is_target", False)
        setattr(BORDER, "is_target", False)
        setattr(WEIGHT, "is_target", False)

        self.fields = {
            "features": FEATURES,
            "features_mfcc": FEATURES_MFCC,
            "labels": LABEL,
            "transcription": TRANSCRIPTION,
            "transcription_int": TRANSCRIPTION_INT,
            "label_vec": LABEL_VEC,
            "out_map": OUT_MAP,
            "out_duration": OUT_DUR,
            "in_transcription": IN_TRANS,
            "index": INDEX,
            "key": KEY,
            "position": POSITION,
            "border": BORDER,
            "weight": WEIGHT,
        }
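The AudioCaching helper used above is not shown; one plausible shape, inferred only from the load/get/set calls in this snippet (a sketch under that assumption, not the project's actual implementation), is a pickle cache keyed by file path plus augmentation key:

    import os
    import pickle
    import hashlib
    import soundfile as sf

    class AudioCaching:
        CACHE_DIR = "./audio_cache"   # assumed location

        @staticmethod
        def _path(audio_file, key):
            digest = hashlib.md5(repr((audio_file, key)).encode()).hexdigest()
            return os.path.join(AudioCaching.CACHE_DIR, digest + ".pkl")

        @staticmethod
        def load(audio_file):
            # raw (un-augmented) samples straight from disk
            audio, _sr = sf.read(audio_file)
            return audio

        @staticmethod
        def get(audio_file, key):
            path = AudioCaching._path(audio_file, key)
            if not os.path.exists(path):
                return None
            with open(path, "rb") as f:
                return pickle.load(f)

        @staticmethod
        def set(audio_file, key, audio):
            os.makedirs(AudioCaching.CACHE_DIR, exist_ok=True)
            with open(AudioCaching._path(audio_file, key), "wb") as f:
                pickle.dump(audio, f)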
Example #26
 def audio_pitch_shift():
     return pyrb.pitch_shift(audio, rate, pitch)
Example #27
0
 def shift_audio(self, semitones, data, sr):
     new_data = pyrb.pitch_shift(data, sr, semitones)
     return new_data
Example #28
0
for meta in tqdm(
        open('ProsodyLabeling/000001-010000.txt', mode='r',
             encoding='utf8').readlines()):
    line = meta.strip().split('\t')
    if len(line) == 1:
        continue
    file_name = line[0]
    file_text = line[1]
    audio_path = os.path.join('Wave', file_name + '.wav')
    shutil.copyfile(os.path.join('Wave', file_name + '.wav'),
                    os.path.join('sample', file_name + '.wav'))
    print(file_name + '\t' + file_text + '\n', file=meta_new)

    for j, i in enumerate(range(len(pitch_list) - 1)):
        pitch_num = random.uniform(pitch_list[i], pitch_list[i + 1])
        # print(pitch_num)
        new_file_name = file_name + '-' + str(j) + '.wav'
        y, sr = librosa.load(audio_path, sr=None)
        y_shifted = pyrubberband.pitch_shift(y, sr, pitch_num)
        # print(os.path.join('sample', new_file_name))
        if y.shape != y_shifted.shape:
            print('test')
        sf.write(os.path.join('sample', new_file_name),
                 y_shifted,
                 sr,
                 format='wav')
        print(file_name + '-' + str(j) + '\t' + file_text + '\n',
              file=meta_new)
    # exit()
    pass
Example #29
0
File: pitch.py Project: EQ4/muda
    def audio(mudabox, state):
        """Deform the audio"""

        mudabox._audio["y"] = pyrb.pitch_shift(mudabox._audio["y"], mudabox._audio["sr"], state["n_semitones"])
Example #30
0
 def audio(mudabox, state):
     mudabox._audio['y'] = pyrb.pitch_shift(mudabox._audio['y'],
                                            mudabox._audio['sr'],
                                            state['n_semitones'])
Example #31
0
    def generate_labels_features_voca(self, all_list):
        pid = os.getpid()
        mp3_config, feature_config, mp3_str, feature_str = self.config_to_folder(
        )

        i = 0  # number of songs
        j = 0  # number of impossible songs
        k = 0  # number of tried songs
        total = 0  # number of generated instances
        stretch_factors = [1.0]
        shift_factors = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6]

        loop_broken = False
        for song_name, lab_path, mp3_path, save_path in all_list:
            save_path = save_path + '_voca'

            # different song initialization
            if loop_broken:
                loop_broken = False

            i += 1
            print(pid, "generating features from ...", os.path.join(mp3_path))
            if i % 10 == 0:
                print(i, ' th song')

            original_wav, sr = librosa.load(os.path.join(mp3_path),
                                            sr=mp3_config['song_hz'])

            # save_path, mp3_string, feature_string, song_name, aug.pt
            result_path = os.path.join(save_path, mp3_str, feature_str,
                                       song_name.strip())
            if not os.path.exists(result_path):
                os.makedirs(result_path)

            # calculate result
            for stretch_factor in stretch_factors:
                if loop_broken:
                    loop_broken = False
                    break

                for shift_factor in shift_factors:
                    # for filename
                    idx = 0

                    try:
                        chord_info = self.Chord_class.get_converted_chord_voca(
                            os.path.join(lab_path))
                    except Exception as e:
                        print(e)
                        print(pid, " chord lab file error : %s" % song_name)
                        loop_broken = True
                        j += 1
                        break

                    k += 1
                    # stretch original sound and chord info
                    x = pyrb.time_stretch(original_wav, sr, stretch_factor)
                    x = pyrb.pitch_shift(x, sr, shift_factor)
                    audio_length = x.shape[0]
                    chord_info[
                        'start'] = chord_info['start'] * 1 / stretch_factor
                    chord_info['end'] = chord_info['end'] * 1 / stretch_factor

                    last_sec = chord_info.iloc[-1]['end']
                    last_sec_hz = int(last_sec * mp3_config['song_hz'])

                    if audio_length + mp3_config['skip_interval'] < last_sec_hz:
                        print('loaded song is too short :', song_name)
                        loop_broken = True
                        j += 1
                        break
                    elif audio_length > last_sec_hz:
                        x = x[:last_sec_hz]

                    origin_length = last_sec_hz
                    origin_length_in_sec = origin_length / mp3_config['song_hz']

                    current_start_second = 0

                    # get chord list between current_start_second and current+song_length
                    while current_start_second + mp3_config[
                            'inst_len'] < origin_length_in_sec:
                        inst_start_sec = current_start_second
                        curSec = current_start_second

                        chord_list = []
                        # extract chord per 1/self.time_interval
                        while curSec < inst_start_sec + mp3_config['inst_len']:
                            try:
                                available_chords = chord_info.loc[
                                    (chord_info['start'] <= curSec)
                                    & (chord_info['end'] > curSec +
                                       self.time_interval)].copy()
                                if len(available_chords) == 0:
                                    available_chords = chord_info.loc[(
                                        (chord_info['start'] >= curSec) &
                                        (chord_info['start'] <= curSec +
                                         self.time_interval)) | (
                                             (chord_info['end'] >= curSec) &
                                             (chord_info['end'] <= curSec +
                                              self.time_interval))].copy()

                                if len(available_chords) == 1:
                                    chord = available_chords['chord_id'].iloc[
                                        0]
                                elif len(available_chords) > 1:
                                    max_starts = available_chords.apply(
                                        lambda row: max(row['start'], curSec),
                                        axis=1)
                                    available_chords['max_start'] = max_starts
                                    min_ends = available_chords.apply(
                                        lambda row: min(
                                            row.end, curSec + self.
                                            time_interval),
                                        axis=1)
                                    available_chords['min_end'] = min_ends
                                    chords_lengths = available_chords[
                                        'min_end'] - available_chords[
                                            'max_start']
                                    available_chords[
                                        'chord_length'] = chords_lengths
                                    chord = available_chords.loc[
                                        available_chords['chord_length'].
                                        idxmax()]['chord_id']
                                else:
                                    chord = 169
                            except Exception as e:
                                chord = 169
                                print(e)
                                print(pid, "no chord")
                                raise RuntimeError()
                            finally:
                                # convert chord by shift factor
                                if chord != 169 and chord != 168:
                                    chord += shift_factor * 14
                                    chord = chord % 168

                                chord_list.append(chord)
                                curSec += self.time_interval

                        if len(chord_list
                               ) == self.no_of_chord_datapoints_per_sequence:
                            try:
                                sequence_start_time = current_start_second
                                sequence_end_time = current_start_second + mp3_config[
                                    'inst_len']

                                start_index = int(sequence_start_time *
                                                  mp3_config['song_hz'])
                                end_index = int(sequence_end_time *
                                                mp3_config['song_hz'])

                                song_seq = x[start_index:end_index]

                                etc = '%.1f_%.1f' % (current_start_second,
                                                     current_start_second +
                                                     mp3_config['inst_len'])
                                aug = '%.2f_%i' % (stretch_factor,
                                                   shift_factor)

                                if self.feature_name == FeatureTypes.cqt:
                                    feature = librosa.cqt(
                                        song_seq,
                                        sr=sr,
                                        n_bins=feature_config['n_bins'],
                                        bins_per_octave=feature_config[
                                            'bins_per_octave'],
                                        hop_length=feature_config['hop_length']
                                    )
                                else:
                                    raise NotImplementedError

                                if feature.shape[
                                        1] > self.no_of_chord_datapoints_per_sequence:
                                    feature = feature[:, :self.
                                                      no_of_chord_datapoints_per_sequence]

                                if feature.shape[
                                        1] != self.no_of_chord_datapoints_per_sequence:
                                    print(
                                        'loaded features length is too short :',
                                        song_name)
                                    loop_broken = True
                                    j += 1
                                    break

                                result = {
                                    'feature': feature,
                                    'chord': chord_list,
                                    'etc': etc
                                }

                                # save_path, mp3_string, feature_string, song_name, aug.pt
                                filename = aug + "_" + str(idx) + ".pt"
                                torch.save(result,
                                           os.path.join(result_path, filename))
                                idx += 1
                                total += 1
                            except Exception as e:
                                print(e)
                                print(pid, "feature error")
                                raise RuntimeError()
                        else:
                            print(
                                "invalid number of chord datapoints in sequence :",
                                len(chord_list))
                        current_start_second += mp3_config['skip_interval']
        print(pid, "total instances: %d" % total)
Example #32
0
 def shift(audio, amount):
     return pyrb.pitch_shift(audio, 16000, amount)
Example #33
-1
def write_audio_file(path, name, voice, audio, sampling_rate):
    file_name = path + \
        time.strftime("%Y%m%d-%H%M%S_") + name + str(randint(0, 100)) + ".wav"
    if voice == "satan:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)

        fixed_framerate = 11000
        sound = AudioSegment.from_file(temp_file_name)
        sound = sound.set_frame_rate(fixed_framerate)
        write(file_name, fixed_framerate, audio)

        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 1.6)
        y_shift = pyrb.pitch_shift(y, sr, 1.6)
        sf.write(file_name, y_stretch, sr, format='wav')

        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    elif voice == "vader:":
        temp_file_name = path + "temp.wav"
        write(temp_file_name, sampling_rate, audio)
        AudioEffect.robotic(temp_file_name, file_name)

        y, sr = sf.read(file_name)
        y_stretch = pyrb.time_stretch(y, sr, 0.9)
        y_shift = pyrb.pitch_shift(y, sr, 0.9)
        sf.write(file_name, y_stretch, sr, format='wav')

        sound = AudioSegment.from_wav(file_name)
        sound.export(file_name, format="wav")
    else:
        write(file_name, sampling_rate, audio)
    return file_name